IMPORTAMOS LAS LIBRERÍAS
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import KNNImputer
import scipy.stats as ss
import warnings
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)
FUNCIONES
def plot_feature(df, col_name, isContinuous, target):
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)
count_null = df[col_name].isnull().sum()
if isContinuous:
sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
else:
sns.countplot(df[col_name], order=sorted(df[col_name].unique()), color='#5975A4', saturation=1, ax=ax1)
ax1.set_xlabel(col_name)
ax1.set_ylabel('Count')
ax1.set_title(col_name+ ' Numero de nulos: '+str(count_null))
plt.xticks(rotation = 90)
if isContinuous:
sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
ax2.set_ylabel('')
ax2.set_title(col_name + ' by '+target)
else:
data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index()
data.columns = [i, target, 'proportion']
#sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
ax2.set_ylabel(target+' fraction')
ax2.set_title(target)
plt.xticks(rotation = 90)
ax2.set_xlabel(col_name)
plt.tight_layout()
def dame_variables_categoricas(dataset=None):
if dataset is None:
print(u'\nFaltan argumentos por pasar a la función')
return 1
lista_variables_categoricas = []
other = []
for i in dataset.columns:
if dataset[i].dtype == object:
unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
if unicos < 100:
lista_variables_categoricas.append(i)
else:
other.append(i)
if dataset[i].dtype == int:
unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
if unicos < 10:
lista_variables_categoricas.append(i)
else:
other.append(i)
return lista_variables_categoricas, other
def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
# Para obtener la correlación de Spearman, sólo cambiar el metodo por 'spearman'
if dataset is None:
print(u'\nHace falta pasar argumentos a la función')
return 1
sns.set(style="white")
# Compute the correlation matrix
corr = dataset.corr(method=metodo)
# Set self-correlation to zero to avoid distraction
for i in range(corr.shape[0]):
corr.iloc[i, i] = 0
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=size_figure)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, center=0,
square=True, linewidths=.5, cmap ='viridis' ) #cbar_kws={"shrink": .5}
plt.show()
return 0
def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
pd_final = pd.DataFrame()
for i in list_var_continuous:
series_mean = pd_loan[i].mean()
series_std = pd_loan[i].std()
std_amp = multiplier * series_std
left = series_mean - std_amp
right = series_mean + std_amp
size_s = pd_loan[i].size
perc_goods = pd_loan[i][(pd_loan[i] >= left) & (pd_loan[i] <= right)].size/size_s
perc_excess = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size/size_s
if perc_excess>0:
pd_concat_percent = pd.DataFrame(pd_loan[target][(pd_loan[i] < left) | (pd_loan[i] > right)]\
.value_counts(normalize=True).reset_index()).T
pd_concat_percent.columns = [pd_concat_percent.iloc[0,0],
pd_concat_percent.iloc[0,1]]
pd_concat_percent = pd_concat_percent.drop('TARGET',axis=0)
pd_concat_percent['variable'] = i
pd_concat_percent['sum_outlier_values'] = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size
pd_concat_percent['porcentaje_sum_null_values'] = perc_excess
pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
if pd_final.empty:
print('No existen variables con valores nulos')
return pd_final
def get_percent_null_values_target(pd_loan, list_var_continuous, target):
pd_final = pd.DataFrame()
for i in list_var_continuous:
if pd_loan[i].isnull().sum()>0:
pd_concat_percent = pd.DataFrame(pd_loan[target][pd_loan[i].isnull()]\
.value_counts(normalize=True).reset_index()).T
pd_concat_percent.columns = [pd_concat_percent.iloc[0,0],
pd_concat_percent.iloc[0,1]]
pd_concat_percent = pd_concat_percent.drop('TARGET',axis=0)
pd_concat_percent['variable'] = i
pd_concat_percent['sum_null_values'] = pd_loan[i].isnull().sum()
pd_concat_percent['porcentaje_sum_null_values'] = pd_loan[i].isnull().sum()/pd_loan.shape[0]
pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
if pd_final.empty:
print('No existen variables con valores nulos')
return pd_final
def cramers_v(confusion_matrix):
chi2 = ss.chi2_contingency(confusion_matrix)[0]
n = confusion_matrix.sum()
phi2 = chi2 / n
r, k = confusion_matrix.shape
phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
rcorr = r - ((r-1)**2)/(n-1)
kcorr = k - ((k-1)**2)/(n-1)
return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
Con estas funciones estamos tratando de hacer varias cosas:
plot_feature: va a generar gráficos específicos para cada variable donde hará un histograma para analizar la distribución y boxplot para ver cómo se distribuyen los valores en base a la variable TARGET para las variables contínuas. Para las variables categóricas va a crear dos gráficos de barras, para la frecuencia de la variable y otro con las proporciones por categoría de TARGET
dame_variables_categoricas: trata de identificar variables categóricas y separarlas en dos: aquellas variables que tienen menos de 100 valores únicos (estas pueden ser muy útiles en nuestro análisis), y aquellas que tienen más de 100 registros únicos
get_corr_matrix: para visualizar la matriz de correlación para variables numéricas
get_deviation_of_mean_perc: crea un rango de valores alrededor de la media con la desviación típica y encuentra la proporción de valores dentro y fuera de este intervalo, para así poder ver los valores atípicos y la relación que tienen con la variable TARGET.
get_percent_values_target: es para analizar la relación que hay entre valores nulos y la variable TARGET.
cramers_v: para ver la relación entre variables categóricas usando la estadística de Cramer
LECTURA DE DATOS DEL PREPROCESADO INICIAL
path_folder = "../data/"
pd_loan = pd.read_csv(path_folder + "application_data.csv", low_memory = False).set_index('SK_ID_CURR')
pd_loan.head()
| TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.018801 | -9461 | -637 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | WEDNESDAY | 10 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | 0.9722 | 0.6341 | 0.0144 | 0.0000 | 0.0690 | 0.0833 | 0.1250 | 0.0377 | 0.022 | 0.0198 | 0.0 | 0.0 | 0.0250 | 0.0369 | 0.9722 | 0.6243 | 0.0144 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0375 | 0.0205 | 0.0193 | 0.0000 | 0.00 | reg oper account | block of flats | 0.0149 | Stone, brick | No | 2.0 | 2.0 | 2.0 | 2.0 | -1134.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | Family | State servant | Higher education | Married | House / apartment | 0.003541 | -16765 | -1188 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Core staff | 2.0 | 1 | 1 | MONDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | School | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | 0.9851 | 0.8040 | 0.0497 | 0.0806 | 0.0345 | 0.2917 | 0.3333 | 0.0128 | 0.079 | 0.0554 | 0.0 | 0.0 | 0.0968 | 0.0529 | 0.9851 | 0.7987 | 0.0608 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0132 | 0.0787 | 0.0558 | 0.0039 | 0.01 | reg oper account | block of flats | 0.0714 | Block | No | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | 135000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.010032 | -19046 | -225 | -4260.0 | -2531 | 26.0 | 1 | 1 | 1 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | MONDAY | 9 | 0 | 0 | 0 | 0 | 0 | 0 | Government | NaN | 0.555912 | 0.729567 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -815.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | 297000.0 | Unaccompanied | Working | Secondary / secondary special | Civil marriage | House / apartment | 0.008019 | -19005 | -3039 | -9833.0 | -2437 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Laborers | 2.0 | 2 | 2 | WEDNESDAY | 17 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | NaN | 0.650442 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 0.0 | 2.0 | 0.0 | -617.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | 513000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.028663 | -19932 | -3038 | -4311.0 | -3458 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Core staff | 1.0 | 2 | 2 | THURSDAY | 11 | 0 | 0 | 0 | 0 | 1 | 1 | Religion | NaN | 0.322738 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -1106.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Utilizamos el ID de cliente como el índice del Dataframe. A continuación, identificamos las variables continuas y categóricas dentro del conjunto de datos
def dame_variables_categoricas(dataset=None):
'''
----------------------------------------------------------------------------------------------------------
Función dame_variables_categoricas:
----------------------------------------------------------------------------------------------------------
- Descripción:
Esta función recibe un DataFrame una lista de las variables categóricas
(con pocos valores únicos).
- Inputs:
- dataset (DataFrame): DataFrame que contiene los datos de entrada.
- Return:
- lista_variables_categoricas (list): Lista con los nombres de las variables
categóricas en el DataFrame.
- other (list): Lista con los nombres de las variables que no cumplen los criterios
para ser categóricas.
- 1: Indica que la ejecución es incorrecta debido a la falta del
argumento 'dataset'.
'''
# Verificar que el DataFrame de entrada no sea nulo
if dataset is None:
print(u'\nError: Falta el argumento dataset en la función')
return 1
lista_variables_categoricas = []
other = []
# Recorrer las columnas del DataFrame
for i in dataset.columns:
# Si la columna es de tipo objeto (posiblemente categórica)
if dataset[i].dtype == object:
unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
if unicos < 100:
lista_variables_categoricas.append(i)
else:
other.append(i)
# Si la columna es de tipo entero
if dataset[i].dtype == int:
unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
if unicos < 10:
lista_variables_categoricas.append(i)
else:
other.append(i)
return lista_variables_categoricas, other
import numpy as np
#Aquí identificamos las variables categóricas
list_var_cat, other = dame_variables_categoricas(dataset=pd_loan)
pd_loan[list_var_cat] = pd_loan[list_var_cat].astype("category")
#Seleccionamos las columnas que tienen datos numéricos continuos
list_var_continuous = list(pd_loan.select_dtypes('float').columns)
pd_loan[list_var_continuous] = pd_loan[list_var_continuous].astype(float)
pd_loan.dtypes
TARGET int64 NAME_CONTRACT_TYPE category CODE_GENDER category FLAG_OWN_CAR category FLAG_OWN_REALTY category CNT_CHILDREN int64 AMT_INCOME_TOTAL float64 AMT_CREDIT float64 AMT_ANNUITY float64 AMT_GOODS_PRICE float64 NAME_TYPE_SUITE category NAME_INCOME_TYPE category NAME_EDUCATION_TYPE category NAME_FAMILY_STATUS category NAME_HOUSING_TYPE category REGION_POPULATION_RELATIVE float64 DAYS_BIRTH int64 DAYS_EMPLOYED int64 DAYS_REGISTRATION float64 DAYS_ID_PUBLISH int64 OWN_CAR_AGE float64 FLAG_MOBIL int64 FLAG_EMP_PHONE int64 FLAG_WORK_PHONE int64 FLAG_CONT_MOBILE int64 FLAG_PHONE int64 FLAG_EMAIL int64 OCCUPATION_TYPE category CNT_FAM_MEMBERS float64 REGION_RATING_CLIENT int64 REGION_RATING_CLIENT_W_CITY int64 WEEKDAY_APPR_PROCESS_START category HOUR_APPR_PROCESS_START int64 REG_REGION_NOT_LIVE_REGION int64 REG_REGION_NOT_WORK_REGION int64 LIVE_REGION_NOT_WORK_REGION int64 REG_CITY_NOT_LIVE_CITY int64 REG_CITY_NOT_WORK_CITY int64 LIVE_CITY_NOT_WORK_CITY int64 ORGANIZATION_TYPE category EXT_SOURCE_1 float64 EXT_SOURCE_2 float64 EXT_SOURCE_3 float64 APARTMENTS_AVG float64 BASEMENTAREA_AVG float64 YEARS_BEGINEXPLUATATION_AVG float64 YEARS_BUILD_AVG float64 COMMONAREA_AVG float64 ELEVATORS_AVG float64 ENTRANCES_AVG float64 FLOORSMAX_AVG float64 FLOORSMIN_AVG float64 LANDAREA_AVG float64 LIVINGAPARTMENTS_AVG float64 LIVINGAREA_AVG float64 NONLIVINGAPARTMENTS_AVG float64 NONLIVINGAREA_AVG float64 APARTMENTS_MODE float64 BASEMENTAREA_MODE float64 YEARS_BEGINEXPLUATATION_MODE float64 YEARS_BUILD_MODE float64 COMMONAREA_MODE float64 ELEVATORS_MODE float64 ENTRANCES_MODE float64 FLOORSMAX_MODE float64 FLOORSMIN_MODE float64 LANDAREA_MODE float64 LIVINGAPARTMENTS_MODE float64 LIVINGAREA_MODE float64 NONLIVINGAPARTMENTS_MODE float64 NONLIVINGAREA_MODE float64 APARTMENTS_MEDI float64 BASEMENTAREA_MEDI float64 YEARS_BEGINEXPLUATATION_MEDI float64 YEARS_BUILD_MEDI float64 COMMONAREA_MEDI float64 ELEVATORS_MEDI float64 ENTRANCES_MEDI float64 FLOORSMAX_MEDI float64 FLOORSMIN_MEDI float64 LANDAREA_MEDI float64 LIVINGAPARTMENTS_MEDI float64 LIVINGAREA_MEDI float64 NONLIVINGAPARTMENTS_MEDI float64 NONLIVINGAREA_MEDI float64 FONDKAPREMONT_MODE category HOUSETYPE_MODE category TOTALAREA_MODE float64 WALLSMATERIAL_MODE category EMERGENCYSTATE_MODE category OBS_30_CNT_SOCIAL_CIRCLE float64 DEF_30_CNT_SOCIAL_CIRCLE float64 OBS_60_CNT_SOCIAL_CIRCLE float64 DEF_60_CNT_SOCIAL_CIRCLE float64 DAYS_LAST_PHONE_CHANGE float64 FLAG_DOCUMENT_2 int64 FLAG_DOCUMENT_3 int64 FLAG_DOCUMENT_4 int64 FLAG_DOCUMENT_5 int64 FLAG_DOCUMENT_6 int64 FLAG_DOCUMENT_7 int64 FLAG_DOCUMENT_8 int64 FLAG_DOCUMENT_9 int64 FLAG_DOCUMENT_10 int64 FLAG_DOCUMENT_11 int64 FLAG_DOCUMENT_12 int64 FLAG_DOCUMENT_13 int64 FLAG_DOCUMENT_14 int64 FLAG_DOCUMENT_15 int64 FLAG_DOCUMENT_16 int64 FLAG_DOCUMENT_17 int64 FLAG_DOCUMENT_18 int64 FLAG_DOCUMENT_19 int64 FLAG_DOCUMENT_20 int64 FLAG_DOCUMENT_21 int64 AMT_REQ_CREDIT_BUREAU_HOUR float64 AMT_REQ_CREDIT_BUREAU_DAY float64 AMT_REQ_CREDIT_BUREAU_WEEK float64 AMT_REQ_CREDIT_BUREAU_MON float64 AMT_REQ_CREDIT_BUREAU_QRT float64 AMT_REQ_CREDIT_BUREAU_YEAR float64 dtype: object
Algunas variables clasificadas inicialmente como enteras son de carácter booleano, lo que las hace categóricas.
SEPARACIÓN EN TRAIN Y TEST ESTRATIFICADO
pd_plot_target = pd_loan['TARGET'].value_counts(normalize=False).reset_index()
pd_plot_target.columns = ['TARGET', 'count']
fig = px.bar(pd_plot_target, x='TARGET', y='count')
fig.update_xaxes(tickvals=[0, 1])
fig.show()
Se observa que el muestreo está desbalanceado, ya que la mayoría de los clientes no tienen dificultades para pagar el préstamo. (0 = pagado y 1 = no pagado) El 20% de los datos se asignan al conjunto de prueba y el 80% al conjunto de entrenamiento para entrenar el modelo.
from sklearn.model_selection import train_test_split
X_pd_loan, X_pd_loan_test, y_pd_loan, y_pd_loan_test = train_test_split(pd_loan.drop('TARGET',axis=1),
pd_loan['TARGET'],
stratify=pd_loan['TARGET'],
test_size=0.2)
pd_loan_train = pd.concat([X_pd_loan, y_pd_loan],axis=1)
pd_loan_test = pd.concat([X_pd_loan_test, y_pd_loan_test],axis=1)
print('== Train\n', pd_loan_train['TARGET'].value_counts(normalize=True))
print('== Test\n', pd_loan_test['TARGET'].value_counts(normalize=True))
== Train TARGET 0 0.919271 1 0.080729 Name: proportion, dtype: float64 == Test TARGET 0 0.919272 1 0.080728 Name: proportion, dtype: float64
Como observamos en el resultado, cada categoría de la variable target tiene la misma proporción tanto en el train como en el test, lo que permite que el modelo tenga ejemplos de todas las clases en ambos conjuntos y asi evita que el modelo tenga un sesgo hacia una clase en particular.
VISUALIZACIÓN DESCRIPTIVA DE LOS DATOS
Aquí vamos a realizar la suma de la cantidad de valores nulos que hay por filas y por columnas.
pd_null_columnas contiene el número de valores nulos por columna y el % de valores nulos respecto al total de filas. pd_null_filas contiene el número de valores nulos por filas y el porcentaje de valores nulos respecto al total de columnas, y la relación con la variable TARGET
#Veo el número de valores nulos por filas y columnas
pd_series_null_columns = pd_loan_train.isnull().sum().sort_values(ascending=False)
pd_series_null_rows = pd_loan_train.isnull().sum(axis=1).sort_values(ascending=False)
print(pd_series_null_columns.shape, pd_series_null_rows.shape)
pd_null_columnas = pd.DataFrame(pd_series_null_columns, columns=['nulos_columnas'])
pd_null_filas = pd.DataFrame(pd_series_null_rows, columns=['nulos_filas'])
pd_null_filas['TARGET'] = pd_loan['TARGET'].copy()
pd_null_columnas['porcentaje_columnas'] = pd_null_columnas['nulos_columnas']/pd_loan_train.shape[0]
pd_null_filas['porcentaje_filas']= pd_null_filas['nulos_filas']/pd_loan_train.shape[1]
(121,) (246008,)
pd_null_columnas.head()
| nulos_columnas | porcentaje_columnas | |
|---|---|---|
| COMMONAREA_MODE | 171857 | 0.698583 |
| COMMONAREA_MEDI | 171857 | 0.698583 |
| COMMONAREA_AVG | 171857 | 0.698583 |
| NONLIVINGAPARTMENTS_MEDI | 170757 | 0.694112 |
| NONLIVINGAPARTMENTS_MODE | 170757 | 0.694112 |
pd_null_filas.head()
| nulos_filas | TARGET | porcentaje_filas | |
|---|---|---|---|
| SK_ID_CURR | |||
| 255145 | 61 | 0 | 0.504132 |
| 412671 | 61 | 0 | 0.504132 |
| 274127 | 61 | 0 | 0.504132 |
| 235599 | 61 | 0 | 0.504132 |
| 329262 | 61 | 0 | 0.504132 |
Hemos hecho un conteo y porcentaje de valores nulos por filas y columnas para identificar qué columnas y filas tienen más valores nulos por si hubiese que tomar la decisión de imputarlas o eliminarlas
list_var_cat
['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
#Realizamos un bucle para las variables categóricas
for variable in list_var_cat:
if variable != "TARGET":
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
# Gráfico de conteo
sns.countplot(data=pd_loan_train, x=variable, ax=axes[0])
axes[0].set_title(f'{variable} Número de nulos: {pd_loan_train[variable].isna().sum()}')
axes[0].set_ylabel('Count')
# Rotar etiquetas del eje X
axes[0].tick_params(axis='x', rotation=45)
# Para cada categoría: gráfico de fracciones
pd_loan_df = pd_loan_train.groupby([variable, 'TARGET']).size().reset_index(name='count')
pd_loan_df['fraction'] = pd_loan_df['count'] / pd_loan_df.groupby(variable)['count'].transform('sum')
sns.barplot(data=pd_loan_df, x=variable, y='fraction', hue='TARGET', ax=axes[1])
axes[1].set_title(f'{variable} vs TARGET')
axes[1].set_ylabel('TARGET fraction')
# Rotar etiquetas del eje X
axes[1].tick_params(axis='x', rotation=45)
# Ajustar diseño
plt.tight_layout()
plt.show()
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
A partir de estos gráficos, podemos ver a la izquierda el número de valores nulos para cada categoría de las variables, y en el de la derecha, cómo se distribuye una variable en función de la variable TARGET. Por lo general, vemos que los datos están bastante desbalanceados, pues hay muchas variables que no vemos que tengan un impacto fuerte en el impago del préstamo. A continuación, vamos a comentar algunas variables que puedan ser significativas en el impago del préstamo.
NAME_INCOME_TYPE (Situación laboral del solicitante) En el gráfico de la izquierda vemos que la mayoría de los datos se asocian con 'Working', es decir, personas que tienen un empleo. Otras categorías presentan también una parte de los datos, como 'Pensioner' o 'Commercial Associate'. Otras categorías como 'Student' o 'Unemployed' presentan muy poca distribución de los datos, o ninguna. En el gráfico de la derecha, que muestra la fracción de incumplimiento para cada categoría, vemos que las personas desempleadas o las que están de baja por maternidad, tienen un porcentaje alto de incumplimiento. Los desempleados parece lógico, pues se asocia con la falta de un trabajo estable. Otras categorías como los pensionistas o los trabajadores presentan un grado bajo de incumplimiento, pues están asociados con un salario o pensión.
NAME_EDUCATION_TYPE(Nivel de educación) En el gráfico de la izquierda vemos que los datos se distribuyen sobre todo en clientes que tienen 'Secondary/Secondary Special', seguidos de 'Higher Education'. 'Academid degree' o 'Lower Secundary' son menos comunes.En el gráfico de la derecha se muestra las fracciones de incumplimiento para cada nivel educativo. Como vemos, secundaria, secundaria especial o secundaria baja son los niveles que más probabilidad de incumplimiento tienen, algo que puede estar relacionado por el tipo de empleo que estos clientes tienen, asociado con la baja formación académica recibida. Los clientes con un grado o educación superior presentan menor riesgo de incumplimiento, algo que también puede estar relacionado por el caso contrario al anterior.Podemos concluir que el nivel educativo influye en la capacidad de incumplimiento del solicitante.
NAME_HOUSING_TYPE(Tipo de vivienda) En el gráfico de la izquierda vemos que la mayoría de los datos se asocian con clientes que vivene un apartameto o casa, que por otra parte es lo más normal. Otras categorías presentan una distribución de los datos mucho menor, como 'Municipal Apartment', 'With Parents' o 'Rented Apartment'. En cuanto a su relación con TARGET, vemos que la fracción de incumplimineto es algo mayor para aquellos solicitantes que viven con los padres o de alquiler. Esto puede estar relacionado con un menor grado de cumplimineto, pues los que viven con sus padres es posible que no tengan una ocupación, aunque también pueden ser aún jóvenes para independizarse. Aun así, no parece una variable muy significativa en cuanto al grado de incumplimiento del préstamo.
OCCUPATYON_TYPE (Tipo de trabajo) Al observar la variable, vemos que presenta una buena cantidad de nulos que podríamos imputar para mejorar el análisis. En el gráfico de la izquierda, vemos que la mayoría de los solicitantes pertenecen a 'Laborers'(en mayor proporción), 'Core Staff' y 'Sales Staff'. Como vemos en el gráfico de la derecha, los trabajadores dedicados a actividades manuales o físicas como la construcción, fabricación, agaricultura, así como aquellos con una baja especialización, como por ejemplo, ayudante de obra, presentan mayor incumplimineto de devolver el préstamo. Otros gremios, como los conductores o los vigilantes de seguridad presentan un grado de incumplimiento menor, aunque mayor que el resto. Esto puede estar debido a la formación educativa, ya que estos trabajos no requieren de una cualificación para dedicarse a ellos. Podemos concluir que el tipo de ocupación puede influir en el grado de incumplimiento del préstamo.
CODE_GENDER(Género) Como vemos en el gráfico de la izquierda, hay mayor proporción de clientes femeninos que masculinos.En el gráfico de la derecha no parece haber una diferencia extremadamente significativa entre el grado de incumplimineto de hombres y mujeres, aunque si nos fijamos el grado de incumplimiento mayor es ligeramente masculino. Esto puede deberse a muchas razones como el tipo de trabajo, mentalidad o tipo de educación, entre otras. En conclusión, esta variable tiene una relación moderada con la probabilidad de incumplkimiento, siendo importante considerar cómoa actúa con otras variables.
NAME_CONTRACT_TYPE (Tipo de contrato) En el gráfico de la izquierda vemos que la mayoría de los préstamos pertenecen a 'Cash Loans', mientras que los 'Revolving Loans' tienen una proproción mucho menor. Con respecto a la variable TARGET, vemos que no hay una gran diferencia en la probabilidad de incumplimiento entre ambos préstamos, pero sí que es verdad que los 'Cash Loans' tienen una probabilidad mayor, cosa que tiene sentido al ser mucho más comunes en el día a día.
for variable in list_var_continuous: # Lista de variables continuas
if variable != "TARGET":
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
# Histograma
sns.histplot(data=pd_loan_train, x=variable, kde=True, ax=axes[0], bins=30)
axes[0].set_title(f'{variable} Número de nulos: {pd_loan_train[variable].isna().sum()}')
axes[0].set_ylabel('Frequency')
# Boxplot vs TARGET
sns.boxplot(data=pd_loan_train, x='TARGET', y=variable, ax=axes[1])
axes[1].set_title(f'{variable} vs TARGET')
axes[1].set_ylabel(variable)
axes[1].set_xlabel('TARGET')
# Ajustar diseño
plt.tight_layout()
plt.show()
Una vez realizados los gráficos de las variables continuas con la variable objetivo (TARGET), procederemos a realizar algunos comentarios de algunas variables que puedan influir en la misma, así como algunas que no sean tan significativas para tener un ejemplo de todo. A continuación, veremos cuáles son las variables elegidas.
OWN_CAR_AGE (Edad del coche propio). Como vemos en el gráfico de la izquierda, hay un número considerable de valores nulos, concretamente 162.426, lo que indica que muchas personas no poseen automóvil o no hay información a cerca de ellos. Esto se debería gestionar mediante, por ejemplo, con la imputación o creando una categoría específica para los que no tienen automóvil. La mayoría de los coches tienen entre 0 y 20 años, con una concentración importante sobre todo de 0 a 10. No hay casi valores más allá de los 40 años, solo unos pocos datos sobrepasan los 60 años. En cuanto a su relación con TARGET, vemos en los boxplots que no hay grandes diferencias en la mediana ni en el rango intercuartílico para 0 y para 1, lo que nos puede indicar que no el coche propio puede no ser un factor influyente en cuanto a devolver o no el crédito.
EXT_SOURCE_1 (Puntuaciones de riesgo crediticio para los solicitantes que provienen de fuentes externas). En el gráfico de la izquierda se puede observar que la variable sigue una distribución simétrica con forma similar a una distribución normal. Esta variable tiene una elevada cantidad de valores lo que indica que puede ser útil para discriminar entre distintos tipos de solicitantes. En cuanto a su relación con TARGET, observamos que el boxplot muestra que la mediana para los clientes que no incumplen (TARGET=0) es algo mayor que la de los que sí incumplen (TARGET=1) tienden a tener valores más altos, lo que puede decir que esta variable está asociada con un menor riesgo de incumplimiento.El rango intercuartílico es algo mayor también en los clientes cumplidores que en los que no. Podemos decir que esta variable puede tener un cierto poder predictivo.Los valores altos de EXT_SOURCE_1 se asocian con un menor riesgo de incumplimiento.
CNT_FAM_MEMBERS (Numero de miembros de la familia). En el gráfico de la izquierda vemos que la variable posee 2 valores nulos, algo que no es significativo dentro del total de datos. En el histograma se muestra que la mayoría de los datos se encuentran entre 2 y 4 miembros, concentrándose sobre todo en 2, sugiriendo que una gran parte de las familias son pequeñas. En relación con TARGET, el boxplot muestra que la distribución es similiar tanto para los cumplidores como para los que no. Tanto la mediana como el rango intercuartílico son iguales, lo que indica que no hay una gran relación con la probabilidad de impago. Hay algunas familias con un número elevado de miembros, aunque no parece que tengan un efecto significativo.
AMT_INCOME_TOTAL (Ingresos totales). Esta variable no posee valores nulos, por lo que podría ser representativa. En cuanto al histograma, la distribución de la variable está sesgada hacia la izquierda, lo que indica que la mayoría de clientes tienen ingresos bajos. La distribuación de los ingresos para TARGET=0 y TARGET=1 es muy similar, La mediana y el rango intercuartílico son también casi iguales, lo que indica que el nivel de ingresos no tiene una relación directa con la posibilidad de impago.
EXT_SOURCE_2 (Puntuaciones de riesgo crediticio para los solicitantes que provienen de fuentes externas) En este caso, presenta pocos valores nulos con respecto al total de los datos, aunque se podrían imputar para no perder demasiada información. En el histograma vemos que la mayoría de los valores están concentrados entre 0.4 y 0.8, es decir, sesgados a la derecha, mientras que los valores extremos son menos comunes.En cuanto a su relación con TARGET, vemos que los valores más altos están relacionados con los clientes que no incumplen. La mediana es mas alta para los clientes que no incumplen, lo que un valor más alto de la variable podría estar asociado con un menor riesgo de incumplimiento. El rango intercuartílico es bastante parecido, aunque se observa que los valores bajos se concentran más en los clientes que presentan incumplimiento, por lo que podemos decir que esta variable puede ser un buen predictor.
EXT_SOURCE_3 (Puntuaciones de riesgo crediticio para los solicitantes que provienen de fuentes externas) Al igual que en las variables anteriores como EXT_SOURCE_1 y EXT_SOURCE_2, prsenta los valores más altos asocidados a clientes que no incumplen el préstamo, lo que indica que valores bajos poseen un mayor riesgo de incumplimiento. Por lo tanto, podemos concluir que la información externa sobre clientes solicitantes de préstamo es una buena variable predictora.
TRATAMIENTO DE VARIABLES CONTINUAS
A continuación se tratarán tres aspectos clave del análisis de los datos: los valores missing, los outliers y las correlaciones entre las variables continuas
list_var_continuous
['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION', 'OWN_CAR_AGE', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'TOTALAREA_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR']
def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
'''
----------------------------------------------------------------------------------------------------------
Función get_deviation_of_mean_perc:
----------------------------------------------------------------------------------------------------------
- Descripción:
Esta función calcula el porcentaje de valores que se encuentran fuera de un
intervalo de confianza, determinado por la media y una desviación estándar
multiplicada por un factor (multiplier), para cada variable continua en el
DataFrame. Luego, analiza la relación entre estos valores atípicos y la variable
objetivo (TARGET), y devuelve un resumen con los porcentajes de valores atípicos
y su distribución en relación con la variable objetivo.
- Inputs:
- pd_loan (DataFrame): DataFrame que contiene los datos de entrada.
- list_var_continuous (list): Lista con los nombres de las variables continuas
a analizar.
- target (str): Nombre de la variable objetivo en el DataFrame.
- multiplier (float): Factor multiplicador para calcular el intervalo de confianza
(desviación estándar).
- Return:
- pd_final (DataFrame): DataFrame que contiene el porcentaje de valores atípicos
por cada variable continua, su distribución con respecto a la variable objetivo
(TARGET), y otros detalles relevantes.
'''
pd_final = pd.DataFrame()
for i in list_var_continuous:
series_mean = pd_loan[i].mean()
series_std = pd_loan[i].std()
std_amp = multiplier * series_std
left = series_mean - std_amp
right = series_mean + std_amp
size_s = pd_loan[i].size
perc_goods = pd_loan[i][(pd_loan[i] >= left) & (pd_loan[i] <= right)].size/size_s
perc_excess = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size/size_s
if perc_excess>0:
pd_concat_percent = pd.DataFrame(pd_loan[target][(pd_loan[i] < left) | (pd_loan[i] > right)]\
.value_counts(normalize=True).reset_index()).T
pd_concat_percent.columns = [pd_concat_percent.iloc[0,0],
pd_concat_percent.iloc[0,1]]
pd_concat_percent['variable'] = i
pd_concat_percent['sum_outlier_values'] = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size
pd_concat_percent['porcentaje_sum_null_values'] = perc_excess
pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
if pd_final.empty:
print('No existen variables con valores nulos')
return pd_final
get_deviation_of_mean_perc(pd_loan_train, list_var_continuous, target='TARGET', multiplier=3)
| 0.0 | 1.0 | variable | sum_outlier_values | porcentaje_sum_null_values | |
|---|---|---|---|---|---|
| 0 | 0.000000 | 1.000000 | AMT_INCOME_TOTAL | 215 | 0.000874 |
| 1 | 0.948837 | 0.051163 | AMT_INCOME_TOTAL | 215 | 0.000874 |
| 2 | 0.000000 | 1.000000 | AMT_CREDIT | 2638 | 0.010723 |
| 3 | 0.958681 | 0.041319 | AMT_CREDIT | 2638 | 0.010723 |
| 4 | 0.000000 | 1.000000 | AMT_ANNUITY | 2354 | 0.009569 |
| 5 | 0.964741 | 0.035259 | AMT_ANNUITY | 2354 | 0.009569 |
| 6 | 0.000000 | 1.000000 | AMT_GOODS_PRICE | 3371 | 0.013703 |
| 7 | 0.961732 | 0.038268 | AMT_GOODS_PRICE | 3371 | 0.013703 |
| 8 | 0.000000 | 1.000000 | REGION_POPULATION_RELATIVE | 6686 | 0.027178 |
| 9 | 0.959168 | 0.040832 | REGION_POPULATION_RELATIVE | 6686 | 0.027178 |
| 10 | 0.000000 | 1.000000 | DAYS_REGISTRATION | 605 | 0.002459 |
| 11 | 0.958678 | 0.041322 | DAYS_REGISTRATION | 605 | 0.002459 |
| 12 | 0.000000 | 1.000000 | OWN_CAR_AGE | 2677 | 0.010882 |
| 13 | 0.917445 | 0.082555 | OWN_CAR_AGE | 2677 | 0.010882 |
| 14 | 0.000000 | 1.000000 | CNT_FAM_MEMBERS | 3223 | 0.013101 |
| 15 | 0.900093 | 0.099907 | CNT_FAM_MEMBERS | 3223 | 0.013101 |
| 16 | 0.000000 | 1.000000 | APARTMENTS_AVG | 2387 | 0.009703 |
| 17 | 0.948890 | 0.051110 | APARTMENTS_AVG | 2387 | 0.009703 |
| 18 | 0.000000 | 1.000000 | BASEMENTAREA_AVG | 1588 | 0.006455 |
| 19 | 0.947733 | 0.052267 | BASEMENTAREA_AVG | 1588 | 0.006455 |
| 20 | 0.000000 | 1.000000 | YEARS_BEGINEXPLUATATION_AVG | 552 | 0.002244 |
| 21 | 0.902174 | 0.097826 | YEARS_BEGINEXPLUATATION_AVG | 552 | 0.002244 |
| 22 | 0.000000 | 1.000000 | YEARS_BUILD_AVG | 964 | 0.003919 |
| 23 | 0.930498 | 0.069502 | YEARS_BUILD_AVG | 964 | 0.003919 |
| 24 | 0.000000 | 1.000000 | COMMONAREA_AVG | 1371 | 0.005573 |
| 25 | 0.951131 | 0.048869 | COMMONAREA_AVG | 1371 | 0.005573 |
| 26 | 0.000000 | 1.000000 | ELEVATORS_AVG | 1941 | 0.007890 |
| 27 | 0.954663 | 0.045337 | ELEVATORS_AVG | 1941 | 0.007890 |
| 28 | 0.000000 | 1.000000 | ENTRANCES_AVG | 1759 | 0.007150 |
| 29 | 0.938033 | 0.061967 | ENTRANCES_AVG | 1759 | 0.007150 |
| 30 | 0.000000 | 1.000000 | FLOORSMAX_AVG | 2056 | 0.008357 |
| 31 | 0.957198 | 0.042802 | FLOORSMAX_AVG | 2056 | 0.008357 |
| 32 | 0.000000 | 1.000000 | FLOORSMIN_AVG | 474 | 0.001927 |
| 33 | 0.955696 | 0.044304 | FLOORSMIN_AVG | 474 | 0.001927 |
| 34 | 0.000000 | 1.000000 | LANDAREA_AVG | 1696 | 0.006894 |
| 35 | 0.938679 | 0.061321 | LANDAREA_AVG | 1696 | 0.006894 |
| 36 | 0.000000 | 1.000000 | LIVINGAPARTMENTS_AVG | 1393 | 0.005662 |
| 37 | 0.952620 | 0.047380 | LIVINGAPARTMENTS_AVG | 1393 | 0.005662 |
| 38 | 0.000000 | 1.000000 | LIVINGAREA_AVG | 2568 | 0.010439 |
| 39 | 0.948988 | 0.051012 | LIVINGAREA_AVG | 2568 | 0.010439 |
| 40 | 0.000000 | 1.000000 | NONLIVINGAPARTMENTS_AVG | 586 | 0.002382 |
| 41 | 0.930034 | 0.069966 | NONLIVINGAPARTMENTS_AVG | 586 | 0.002382 |
| 42 | 0.000000 | 1.000000 | NONLIVINGAREA_AVG | 1956 | 0.007951 |
| 43 | 0.947342 | 0.052658 | NONLIVINGAREA_AVG | 1956 | 0.007951 |
| 44 | 0.000000 | 1.000000 | APARTMENTS_MODE | 2411 | 0.009800 |
| 45 | 0.948569 | 0.051431 | APARTMENTS_MODE | 2411 | 0.009800 |
| 46 | 0.000000 | 1.000000 | BASEMENTAREA_MODE | 1689 | 0.006866 |
| 47 | 0.944938 | 0.055062 | BASEMENTAREA_MODE | 1689 | 0.006866 |
| 48 | 0.000000 | 1.000000 | YEARS_BEGINEXPLUATATION_MODE | 540 | 0.002195 |
| 49 | 0.900000 | 0.100000 | YEARS_BEGINEXPLUATATION_MODE | 540 | 0.002195 |
| 50 | 0.000000 | 1.000000 | YEARS_BUILD_MODE | 975 | 0.003963 |
| 51 | 0.931282 | 0.068718 | YEARS_BUILD_MODE | 975 | 0.003963 |
| 52 | 0.000000 | 1.000000 | COMMONAREA_MODE | 1359 | 0.005524 |
| 53 | 0.946284 | 0.053716 | COMMONAREA_MODE | 1359 | 0.005524 |
| 54 | 0.000000 | 1.000000 | ELEVATORS_MODE | 2679 | 0.010890 |
| 55 | 0.951101 | 0.048899 | ELEVATORS_MODE | 2679 | 0.010890 |
| 56 | 0.000000 | 1.000000 | ENTRANCES_MODE | 2094 | 0.008512 |
| 57 | 0.941261 | 0.058739 | ENTRANCES_MODE | 2094 | 0.008512 |
| 58 | 0.000000 | 1.000000 | FLOORSMAX_MODE | 2085 | 0.008475 |
| 59 | 0.958753 | 0.041247 | FLOORSMAX_MODE | 2085 | 0.008475 |
| 60 | 0.000000 | 1.000000 | FLOORSMIN_MODE | 382 | 0.001553 |
| 61 | 0.958115 | 0.041885 | FLOORSMIN_MODE | 382 | 0.001553 |
| 62 | 0.000000 | 1.000000 | LANDAREA_MODE | 1714 | 0.006967 |
| 63 | 0.937573 | 0.062427 | LANDAREA_MODE | 1714 | 0.006967 |
| 64 | 0.000000 | 1.000000 | LIVINGAPARTMENTS_MODE | 1438 | 0.005845 |
| 65 | 0.949235 | 0.050765 | LIVINGAPARTMENTS_MODE | 1438 | 0.005845 |
| 66 | 0.000000 | 1.000000 | LIVINGAREA_MODE | 2687 | 0.010922 |
| 67 | 0.947897 | 0.052103 | LIVINGAREA_MODE | 2687 | 0.010922 |
| 68 | 0.000000 | 1.000000 | NONLIVINGAPARTMENTS_MODE | 546 | 0.002219 |
| 69 | 0.921245 | 0.078755 | NONLIVINGAPARTMENTS_MODE | 546 | 0.002219 |
| 70 | 0.000000 | 1.000000 | NONLIVINGAREA_MODE | 1994 | 0.008105 |
| 71 | 0.948345 | 0.051655 | NONLIVINGAREA_MODE | 1994 | 0.008105 |
| 72 | 0.000000 | 1.000000 | APARTMENTS_MEDI | 2428 | 0.009870 |
| 73 | 0.948929 | 0.051071 | APARTMENTS_MEDI | 2428 | 0.009870 |
| 74 | 0.000000 | 1.000000 | BASEMENTAREA_MEDI | 1621 | 0.006589 |
| 75 | 0.948180 | 0.051820 | BASEMENTAREA_MEDI | 1621 | 0.006589 |
| 76 | 0.000000 | 1.000000 | YEARS_BEGINEXPLUATATION_MEDI | 515 | 0.002093 |
| 77 | 0.897087 | 0.102913 | YEARS_BEGINEXPLUATATION_MEDI | 515 | 0.002093 |
| 78 | 0.000000 | 1.000000 | YEARS_BUILD_MEDI | 976 | 0.003967 |
| 79 | 0.931352 | 0.068648 | YEARS_BUILD_MEDI | 976 | 0.003967 |
| 80 | 0.000000 | 1.000000 | COMMONAREA_MEDI | 1393 | 0.005662 |
| 81 | 0.949749 | 0.050251 | COMMONAREA_MEDI | 1393 | 0.005662 |
| 82 | 0.000000 | 1.000000 | ELEVATORS_MEDI | 1930 | 0.007845 |
| 83 | 0.954922 | 0.045078 | ELEVATORS_MEDI | 1930 | 0.007845 |
| 84 | 0.000000 | 1.000000 | ENTRANCES_MEDI | 1769 | 0.007191 |
| 85 | 0.937818 | 0.062182 | ENTRANCES_MEDI | 1769 | 0.007191 |
| 86 | 0.000000 | 1.000000 | FLOORSMAX_MEDI | 2166 | 0.008805 |
| 87 | 0.957525 | 0.042475 | FLOORSMAX_MEDI | 2166 | 0.008805 |
| 88 | 0.000000 | 1.000000 | FLOORSMIN_MEDI | 443 | 0.001801 |
| 89 | 0.957111 | 0.042889 | FLOORSMIN_MEDI | 443 | 0.001801 |
| 90 | 0.000000 | 1.000000 | LANDAREA_MEDI | 1717 | 0.006979 |
| 91 | 0.940594 | 0.059406 | LANDAREA_MEDI | 1717 | 0.006979 |
| 92 | 0.000000 | 1.000000 | LIVINGAPARTMENTS_MEDI | 1411 | 0.005736 |
| 93 | 0.951807 | 0.048193 | LIVINGAPARTMENTS_MEDI | 1411 | 0.005736 |
| 94 | 0.000000 | 1.000000 | LIVINGAREA_MEDI | 2581 | 0.010492 |
| 95 | 0.950794 | 0.049206 | LIVINGAREA_MEDI | 2581 | 0.010492 |
| 96 | 0.000000 | 1.000000 | NONLIVINGAPARTMENTS_MEDI | 584 | 0.002374 |
| 97 | 0.928082 | 0.071918 | NONLIVINGAPARTMENTS_MEDI | 584 | 0.002374 |
| 98 | 0.000000 | 1.000000 | NONLIVINGAREA_MEDI | 1977 | 0.008036 |
| 99 | 0.948407 | 0.051593 | NONLIVINGAREA_MEDI | 1977 | 0.008036 |
| 100 | 0.000000 | 1.000000 | TOTALAREA_MODE | 2655 | 0.010792 |
| 101 | 0.958192 | 0.041808 | TOTALAREA_MODE | 2655 | 0.010792 |
| 102 | 0.000000 | 1.000000 | OBS_30_CNT_SOCIAL_CIRCLE | 4932 | 0.020048 |
| 103 | 0.912003 | 0.087997 | OBS_30_CNT_SOCIAL_CIRCLE | 4932 | 0.020048 |
| 104 | 0.000000 | 1.000000 | DEF_30_CNT_SOCIAL_CIRCLE | 5516 | 0.022422 |
| 105 | 0.883249 | 0.116751 | DEF_30_CNT_SOCIAL_CIRCLE | 5516 | 0.022422 |
| 106 | 0.000000 | 1.000000 | OBS_60_CNT_SOCIAL_CIRCLE | 4787 | 0.019459 |
| 107 | 0.912053 | 0.087947 | OBS_60_CNT_SOCIAL_CIRCLE | 4787 | 0.019459 |
| 108 | 0.000000 | 1.000000 | DEF_60_CNT_SOCIAL_CIRCLE | 3174 | 0.012902 |
| 109 | 0.875551 | 0.124449 | DEF_60_CNT_SOCIAL_CIRCLE | 3174 | 0.012902 |
| 110 | 0.000000 | 1.000000 | DAYS_LAST_PHONE_CHANGE | 511 | 0.002077 |
| 111 | 0.958904 | 0.041096 | DAYS_LAST_PHONE_CHANGE | 511 | 0.002077 |
| 112 | 0.000000 | 1.000000 | AMT_REQ_CREDIT_BUREAU_HOUR | 1277 | 0.005191 |
| 113 | 0.924041 | 0.075959 | AMT_REQ_CREDIT_BUREAU_HOUR | 1277 | 0.005191 |
| 114 | 0.000000 | 1.000000 | AMT_REQ_CREDIT_BUREAU_DAY | 1176 | 0.004780 |
| 115 | 0.909014 | 0.090986 | AMT_REQ_CREDIT_BUREAU_DAY | 1176 | 0.004780 |
| 116 | 0.000000 | 1.000000 | AMT_REQ_CREDIT_BUREAU_WEEK | 6754 | 0.027454 |
| 117 | 0.922564 | 0.077436 | AMT_REQ_CREDIT_BUREAU_WEEK | 6754 | 0.027454 |
| 118 | 0.000000 | 1.000000 | AMT_REQ_CREDIT_BUREAU_MON | 2580 | 0.010487 |
| 119 | 0.946512 | 0.053488 | AMT_REQ_CREDIT_BUREAU_MON | 2580 | 0.010487 |
| 120 | 0.000000 | 1.000000 | AMT_REQ_CREDIT_BUREAU_QRT | 1842 | 0.007488 |
| 121 | 0.915852 | 0.084148 | AMT_REQ_CREDIT_BUREAU_QRT | 1842 | 0.007488 |
| 122 | 0.000000 | 1.000000 | AMT_REQ_CREDIT_BUREAU_YEAR | 2713 | 0.011028 |
| 123 | 0.908220 | 0.091780 | AMT_REQ_CREDIT_BUREAU_YEAR | 2713 | 0.011028 |
CORRELACIONES
def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
''''
----------------------------------------------------------------------------------------------------------
Función get_corr_matrix:
----------------------------------------------------------------------------------------------------------
- Descripción:
Esta función calcula y visualiza la matriz de correlación entre las variables
numéricas de un conjunto de datos.
- Inputs:
- dataset (DataFrame): Conjunto de datos con las variables numéricas a analizar.
- metodo (str): Método de correlación a utilizar.
- size_figure (list): Tamaño de la figura del gráfico.
- Return:
- None: Muestra un mapa de calor de la matriz de correlación.
----------------------------------------------------------------------------------------------------------
'''
# Comprobación de que se ha proporcionado el dataset
if dataset is None:
print(u'\nHace falta pasar argumentos a la función')
return 1
sns.set(style="white")
# Calcular la matriz de correlación
corr = dataset.corr(method=metodo)
# Establecer la autocorrelación a cero para evitar distracciones
for i in range(corr.shape[0]):
corr.iloc[i, i] = 0
f, ax = plt.subplots(figsize=size_figure)
# Dibujar el mapa de calor con la correlación
sns.heatmap(corr, center=0,
square=True, linewidths=.5, cmap ='viridis' ) #cbar_kws={"shrink": .5}
plt.show()
return 0
get_corr_matrix(dataset = pd_loan_train[list_var_continuous],
metodo='pearson', size_figure=[10,8])
0
En la matriz de correlación se observan varias correlaciones de Pearson cercanas o iguales a 1, lo que indica que algunas variables están perfectamente correlacionadas. Esto puede deberse a la existencia de variables redundantes o derivadas de otras, lo que podría generar problemas de multicolinealidad en modelos estadísticos. Es recomendable revisar estas variables y, en caso de ser necesario, eliminar las que sean idénticas o altamente correlacionadas para evitar distorsiones en los resultados del modelo.
corr = pd_loan_train[list_var_continuous].corr('pearson')
new_corr = corr.abs()
new_corr.loc[:,:] = np.tril(new_corr, k=-1) # below main lower triangle of an array
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation']>0.6]
| level_0 | level_1 | correlation | |
|---|---|---|---|
| 3694 | OBS_60_CNT_SOCIAL_CIRCLE | OBS_30_CNT_SOCIAL_CIRCLE | 0.998498 |
| 2744 | YEARS_BUILD_MEDI | YEARS_BUILD_AVG | 0.998480 |
| 3074 | FLOORSMIN_MEDI | FLOORSMIN_AVG | 0.997218 |
| 3008 | FLOORSMAX_MEDI | FLOORSMAX_AVG | 0.996977 |
| 2942 | ENTRANCES_MEDI | ENTRANCES_AVG | 0.996884 |
| 2876 | ELEVATORS_MEDI | ELEVATORS_AVG | 0.996026 |
| 2810 | COMMONAREA_MEDI | COMMONAREA_AVG | 0.995566 |
| 3272 | LIVINGAREA_MEDI | LIVINGAREA_AVG | 0.995388 |
| 2612 | BASEMENTAREA_MEDI | BASEMENTAREA_AVG | 0.994972 |
| 2546 | APARTMENTS_MEDI | APARTMENTS_AVG | 0.994890 |
| 3206 | LIVINGAPARTMENTS_MEDI | LIVINGAPARTMENTS_AVG | 0.993515 |
| 2678 | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BEGINEXPLUATATION_AVG | 0.993382 |
| 3140 | LANDAREA_MEDI | LANDAREA_AVG | 0.990689 |
| 3404 | NONLIVINGAREA_MEDI | NONLIVINGAREA_AVG | 0.989493 |
| 2758 | YEARS_BUILD_MEDI | YEARS_BUILD_MODE | 0.989490 |
| 1834 | YEARS_BUILD_MODE | YEARS_BUILD_AVG | 0.989383 |
| 3338 | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAPARTMENTS_AVG | 0.988915 |
| 3088 | FLOORSMIN_MEDI | FLOORSMIN_MODE | 0.988121 |
| 3022 | FLOORSMAX_MEDI | FLOORSMAX_MODE | 0.987859 |
| 196 | AMT_GOODS_PRICE | AMT_CREDIT | 0.987028 |
| 2164 | FLOORSMIN_MODE | FLOORSMIN_AVG | 0.985501 |
| 2098 | FLOORSMAX_MODE | FLOORSMAX_AVG | 0.985307 |
| 2890 | ELEVATORS_MEDI | ELEVATORS_MODE | 0.982507 |
| 2956 | ENTRANCES_MEDI | ENTRANCES_MODE | 0.980636 |
| 3154 | LANDAREA_MEDI | LANDAREA_MODE | 0.980086 |
| 2824 | COMMONAREA_MEDI | COMMONAREA_MODE | 0.978585 |
| 1966 | ELEVATORS_MODE | ELEVATORS_AVG | 0.978504 |
| 2032 | ENTRANCES_MODE | ENTRANCES_AVG | 0.977752 |
| 2626 | BASEMENTAREA_MEDI | BASEMENTAREA_MODE | 0.977507 |
| 2560 | APARTMENTS_MEDI | APARTMENTS_MODE | 0.976882 |
| 3418 | NONLIVINGAREA_MEDI | NONLIVINGAREA_MODE | 0.975851 |
| 3352 | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAPARTMENTS_MODE | 0.975829 |
| 1900 | COMMONAREA_MODE | COMMONAREA_AVG | 0.975696 |
| 3220 | LIVINGAPARTMENTS_MEDI | LIVINGAPARTMENTS_MODE | 0.975206 |
| 3286 | LIVINGAREA_MEDI | LIVINGAREA_MODE | 0.974256 |
| 1702 | BASEMENTAREA_MODE | BASEMENTAREA_AVG | 0.973761 |
| 1636 | APARTMENTS_MODE | APARTMENTS_AVG | 0.972757 |
| 2230 | LANDAREA_MODE | LANDAREA_AVG | 0.972270 |
| 2362 | LIVINGAREA_MODE | LIVINGAREA_AVG | 0.971495 |
| 1768 | YEARS_BEGINEXPLUATATION_MODE | YEARS_BEGINEXPLUATATION_AVG | 0.970595 |
| 2296 | LIVINGAPARTMENTS_MODE | LIVINGAPARTMENTS_AVG | 0.969416 |
| 2494 | NONLIVINGAREA_MODE | NONLIVINGAREA_AVG | 0.965026 |
| 2428 | NONLIVINGAPARTMENTS_MODE | NONLIVINGAPARTMENTS_AVG | 0.964931 |
| 2692 | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BEGINEXPLUATATION_MODE | 0.961011 |
| 1376 | LIVINGAPARTMENTS_AVG | APARTMENTS_AVG | 0.943873 |
| 3224 | LIVINGAPARTMENTS_MEDI | APARTMENTS_MEDI | 0.942700 |
| 3196 | LIVINGAPARTMENTS_MEDI | APARTMENTS_AVG | 0.941100 |
| 2300 | LIVINGAPARTMENTS_MODE | APARTMENTS_MODE | 0.938834 |
| 2556 | APARTMENTS_MEDI | LIVINGAPARTMENTS_AVG | 0.935462 |
| 2570 | APARTMENTS_MEDI | LIVINGAPARTMENTS_MODE | 0.932595 |
| 2286 | LIVINGAPARTMENTS_MODE | APARTMENTS_AVG | 0.930757 |
| 3467 | TOTALAREA_MODE | LIVINGAREA_AVG | 0.925346 |
| 3495 | TOTALAREA_MODE | LIVINGAREA_MEDI | 0.919845 |
| 3289 | LIVINGAREA_MEDI | APARTMENTS_MEDI | 0.915476 |
| 3210 | LIVINGAPARTMENTS_MEDI | APARTMENTS_MODE | 0.914985 |
| 1441 | LIVINGAREA_AVG | APARTMENTS_AVG | 0.912775 |
| 2557 | APARTMENTS_MEDI | LIVINGAREA_AVG | 0.911891 |
| 3261 | LIVINGAREA_MEDI | APARTMENTS_AVG | 0.911658 |
| 2365 | LIVINGAREA_MODE | APARTMENTS_MODE | 0.910327 |
| 1646 | APARTMENTS_MODE | LIVINGAPARTMENTS_AVG | 0.908376 |
| 3481 | TOTALAREA_MODE | LIVINGAREA_MODE | 0.899464 |
| 2571 | APARTMENTS_MEDI | LIVINGAREA_MODE | 0.895374 |
| 3275 | LIVINGAREA_MEDI | APARTMENTS_MODE | 0.893746 |
| 2351 | LIVINGAREA_MODE | APARTMENTS_AVG | 0.892603 |
| 3456 | TOTALAREA_MODE | APARTMENTS_AVG | 0.891460 |
| 1647 | APARTMENTS_MODE | LIVINGAREA_AVG | 0.890110 |
| 3484 | TOTALAREA_MODE | APARTMENTS_MEDI | 0.885698 |
| 3299 | LIVINGAREA_MEDI | LIVINGAPARTMENTS_MEDI | 0.885637 |
| 3207 | LIVINGAPARTMENTS_MEDI | LIVINGAREA_AVG | 0.883687 |
| 1451 | LIVINGAREA_AVG | LIVINGAPARTMENTS_AVG | 0.880839 |
| 2375 | LIVINGAREA_MODE | LIVINGAPARTMENTS_MODE | 0.879683 |
| 3271 | LIVINGAREA_MEDI | LIVINGAPARTMENTS_AVG | 0.878834 |
| 3285 | LIVINGAREA_MEDI | LIVINGAPARTMENTS_MODE | 0.875439 |
| 2297 | LIVINGAPARTMENTS_MODE | LIVINGAREA_AVG | 0.873638 |
| 3294 | LIVINGAREA_MEDI | ELEVATORS_MEDI | 0.868963 |
| 1446 | LIVINGAREA_AVG | ELEVATORS_AVG | 0.868139 |
| 2882 | ELEVATORS_MEDI | LIVINGAREA_AVG | 0.866215 |
| 3266 | LIVINGAREA_MEDI | ELEVATORS_AVG | 0.866211 |
| 3470 | TOTALAREA_MODE | APARTMENTS_MODE | 0.862547 |
| 3760 | DEF_60_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | 0.861292 |
| 3221 | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MODE | 0.858074 |
| 2370 | LIVINGAREA_MODE | ELEVATORS_MODE | 0.856832 |
| 3280 | LIVINGAREA_MEDI | ELEVATORS_MODE | 0.856177 |
| 1972 | ELEVATORS_MODE | LIVINGAREA_AVG | 0.852708 |
| 2361 | LIVINGAREA_MODE | LIVINGAPARTMENTS_AVG | 0.852073 |
| 3466 | TOTALAREA_MODE | LIVINGAPARTMENTS_AVG | 0.848978 |
| 3494 | TOTALAREA_MODE | LIVINGAPARTMENTS_MEDI | 0.847425 |
| 3461 | TOTALAREA_MODE | ELEVATORS_AVG | 0.844127 |
| 2896 | ELEVATORS_MEDI | LIVINGAREA_MODE | 0.841342 |
| 2356 | LIVINGAREA_MODE | ELEVATORS_AVG | 0.839310 |
| 3489 | TOTALAREA_MODE | ELEVATORS_MEDI | 0.837782 |
| 2899 | ELEVATORS_MEDI | APARTMENTS_MEDI | 0.836951 |
| 1051 | ELEVATORS_AVG | APARTMENTS_AVG | 0.836207 |
| 3480 | TOTALAREA_MODE | LIVINGAPARTMENTS_MODE | 0.834900 |
| 2871 | ELEVATORS_MEDI | APARTMENTS_AVG | 0.834418 |
| 2551 | APARTMENTS_MEDI | ELEVATORS_AVG | 0.834012 |
| 1975 | ELEVATORS_MODE | APARTMENTS_MODE | 0.825476 |
| 2565 | APARTMENTS_MEDI | ELEVATORS_MODE | 0.824589 |
| 1961 | ELEVATORS_MODE | APARTMENTS_AVG | 0.821284 |
| 3475 | TOTALAREA_MODE | ELEVATORS_MODE | 0.819961 |
| 3229 | LIVINGAPARTMENTS_MEDI | ELEVATORS_MEDI | 0.814474 |
| 3201 | LIVINGAPARTMENTS_MEDI | ELEVATORS_AVG | 0.812741 |
| 1381 | LIVINGAPARTMENTS_AVG | ELEVATORS_AVG | 0.811521 |
| 2881 | ELEVATORS_MEDI | LIVINGAPARTMENTS_AVG | 0.809250 |
| 2305 | LIVINGAPARTMENTS_MODE | ELEVATORS_MODE | 0.808508 |
| 2885 | ELEVATORS_MEDI | APARTMENTS_MODE | 0.807934 |
| 1641 | APARTMENTS_MODE | ELEVATORS_AVG | 0.805094 |
| 3215 | LIVINGAPARTMENTS_MEDI | ELEVATORS_MODE | 0.799520 |
| 2895 | ELEVATORS_MEDI | LIVINGAPARTMENTS_MODE | 0.799019 |
| 2291 | LIVINGAPARTMENTS_MODE | ELEVATORS_AVG | 0.797236 |
| 1971 | ELEVATORS_MODE | LIVINGAPARTMENTS_AVG | 0.793922 |
| 197 | AMT_GOODS_PRICE | AMT_ANNUITY | 0.775646 |
| 131 | AMT_ANNUITY | AMT_CREDIT | 0.770755 |
| 1253 | FLOORSMIN_AVG | FLOORSMAX_AVG | 0.744612 |
| 3101 | FLOORSMIN_MEDI | FLOORSMAX_MEDI | 0.742574 |
| 3009 | FLOORSMAX_MEDI | FLOORSMIN_AVG | 0.742076 |
| 3073 | FLOORSMIN_MEDI | FLOORSMAX_AVG | 0.742022 |
| 3087 | FLOORSMIN_MEDI | FLOORSMAX_MODE | 0.731779 |
| 2099 | FLOORSMAX_MODE | FLOORSMIN_AVG | 0.731063 |
| 2177 | FLOORSMIN_MODE | FLOORSMAX_MODE | 0.728064 |
| 3023 | FLOORSMAX_MEDI | FLOORSMIN_MODE | 0.724399 |
| 2163 | FLOORSMIN_MODE | FLOORSMAX_AVG | 0.723555 |
| 1442 | LIVINGAREA_AVG | BASEMENTAREA_AVG | 0.693530 |
| 2622 | BASEMENTAREA_MEDI | LIVINGAREA_AVG | 0.692890 |
| 3290 | LIVINGAREA_MEDI | BASEMENTAREA_MEDI | 0.692285 |
| 2366 | LIVINGAREA_MODE | BASEMENTAREA_MODE | 0.691908 |
| 3262 | LIVINGAREA_MEDI | BASEMENTAREA_AVG | 0.690430 |
| 2636 | BASEMENTAREA_MEDI | LIVINGAREA_MODE | 0.681994 |
| 2639 | BASEMENTAREA_MEDI | APARTMENTS_MEDI | 0.680139 |
| 791 | BASEMENTAREA_AVG | APARTMENTS_AVG | 0.679079 |
| 2352 | LIVINGAREA_MODE | BASEMENTAREA_AVG | 0.679063 |
| 1186 | FLOORSMAX_AVG | ELEVATORS_AVG | 0.679062 |
| 2611 | BASEMENTAREA_MEDI | APARTMENTS_AVG | 0.678390 |
| 1715 | BASEMENTAREA_MODE | APARTMENTS_MODE | 0.678241 |
| 2547 | APARTMENTS_MEDI | BASEMENTAREA_AVG | 0.678062 |
| 3006 | FLOORSMAX_MEDI | ELEVATORS_AVG | 0.676727 |
| 2878 | ELEVATORS_MEDI | FLOORSMAX_AVG | 0.675519 |
| 3034 | FLOORSMAX_MEDI | ELEVATORS_MEDI | 0.675033 |
| 1712 | BASEMENTAREA_MODE | LIVINGAREA_AVG | 0.674008 |
| 3457 | TOTALAREA_MODE | BASEMENTAREA_AVG | 0.674000 |
| 3276 | LIVINGAREA_MEDI | BASEMENTAREA_MODE | 0.673957 |
| 3485 | TOTALAREA_MODE | BASEMENTAREA_MEDI | 0.670770 |
| 2096 | FLOORSMAX_MODE | ELEVATORS_AVG | 0.669606 |
| 2625 | BASEMENTAREA_MEDI | APARTMENTS_MODE | 0.668767 |
| 2892 | ELEVATORS_MEDI | FLOORSMAX_MODE | 0.668033 |
| 1637 | APARTMENTS_MODE | BASEMENTAREA_AVG | 0.666231 |
| 2561 | APARTMENTS_MEDI | BASEMENTAREA_MODE | 0.662903 |
| 1701 | BASEMENTAREA_MODE | APARTMENTS_AVG | 0.660205 |
| 2110 | FLOORSMAX_MODE | ELEVATORS_MODE | 0.659724 |
| 2041 | ENTRANCES_MODE | BASEMENTAREA_MODE | 0.655824 |
| 1707 | BASEMENTAREA_MODE | ENTRANCES_AVG | 0.655192 |
| 1968 | ELEVATORS_MODE | FLOORSMAX_AVG | 0.655152 |
| 2617 | BASEMENTAREA_MEDI | ENTRANCES_AVG | 0.654544 |
| 3020 | FLOORSMAX_MEDI | ELEVATORS_MODE | 0.654446 |
| 2301 | LIVINGAPARTMENTS_MODE | BASEMENTAREA_MODE | 0.654209 |
| 2951 | ENTRANCES_MEDI | BASEMENTAREA_MODE | 0.654198 |
| 2965 | ENTRANCES_MEDI | BASEMENTAREA_MEDI | 0.653110 |
| 1117 | ENTRANCES_AVG | BASEMENTAREA_AVG | 0.653068 |
| 2635 | BASEMENTAREA_MEDI | LIVINGAPARTMENTS_MODE | 0.652382 |
| 3225 | LIVINGAPARTMENTS_MEDI | BASEMENTAREA_MEDI | 0.652122 |
| 3471 | TOTALAREA_MODE | BASEMENTAREA_MODE | 0.650148 |
| 3197 | LIVINGAPARTMENTS_MEDI | BASEMENTAREA_AVG | 0.649861 |
| 2937 | ENTRANCES_MEDI | BASEMENTAREA_AVG | 0.648946 |
| 2287 | LIVINGAPARTMENTS_MODE | BASEMENTAREA_AVG | 0.648856 |
| 1377 | LIVINGAPARTMENTS_AVG | BASEMENTAREA_AVG | 0.647472 |
| 2621 | BASEMENTAREA_MEDI | LIVINGAPARTMENTS_AVG | 0.647065 |
| 2631 | BASEMENTAREA_MEDI | ENTRANCES_MODE | 0.632527 |
| 3463 | TOTALAREA_MODE | FLOORSMAX_AVG | 0.632427 |
| 1448 | LIVINGAREA_AVG | FLOORSMAX_AVG | 0.630578 |
| 3491 | TOTALAREA_MODE | FLOORSMAX_MEDI | 0.629713 |
| 3211 | LIVINGAPARTMENTS_MEDI | BASEMENTAREA_MODE | 0.629397 |
| 3012 | FLOORSMAX_MEDI | LIVINGAREA_AVG | 0.628444 |
| 2027 | ENTRANCES_MODE | BASEMENTAREA_AVG | 0.627951 |
| 3268 | LIVINGAREA_MEDI | FLOORSMAX_AVG | 0.627410 |
| 3296 | LIVINGAREA_MEDI | FLOORSMAX_MEDI | 0.626521 |
| 2102 | FLOORSMAX_MODE | LIVINGAREA_AVG | 0.625773 |
| 3477 | TOTALAREA_MODE | FLOORSMAX_MODE | 0.625002 |
| 1711 | BASEMENTAREA_MODE | LIVINGAPARTMENTS_AVG | 0.624275 |
| 3282 | LIVINGAREA_MEDI | FLOORSMAX_MODE | 0.624138 |
| 2371 | LIVINGAREA_MODE | ENTRANCES_MODE | 0.623087 |
| 2357 | LIVINGAREA_MODE | ENTRANCES_AVG | 0.623077 |
| 2961 | ENTRANCES_MEDI | LIVINGAREA_MODE | 0.622534 |
| 3267 | LIVINGAREA_MEDI | ENTRANCES_AVG | 0.619274 |
| 3295 | LIVINGAREA_MEDI | ENTRANCES_MEDI | 0.618969 |
| 1447 | LIVINGAREA_AVG | ENTRANCES_AVG | 0.618727 |
| 1181 | FLOORSMAX_AVG | APARTMENTS_AVG | 0.618204 |
| 3001 | FLOORSMAX_MEDI | APARTMENTS_AVG | 0.615957 |
| 2947 | ENTRANCES_MEDI | LIVINGAREA_AVG | 0.614989 |
| 2040 | ENTRANCES_MODE | APARTMENTS_MODE | 0.614956 |
| 2553 | APARTMENTS_MEDI | FLOORSMAX_AVG | 0.614953 |
| 3029 | FLOORSMAX_MEDI | APARTMENTS_MEDI | 0.614018 |
| 2091 | FLOORSMAX_MODE | APARTMENTS_AVG | 0.613594 |
| 2950 | ENTRANCES_MEDI | APARTMENTS_MODE | 0.612075 |
| 2567 | APARTMENTS_MEDI | FLOORSMAX_MODE | 0.611887 |
| 1642 | APARTMENTS_MODE | ENTRANCES_AVG | 0.611844 |
| 2552 | APARTMENTS_MEDI | ENTRANCES_AVG | 0.611029 |
| 2964 | ENTRANCES_MEDI | APARTMENTS_MEDI | 0.610880 |
| 1116 | ENTRANCES_AVG | APARTMENTS_AVG | 0.610671 |
| 2936 | ENTRANCES_MEDI | APARTMENTS_AVG | 0.607111 |
| 2372 | LIVINGAREA_MODE | FLOORSMAX_MODE | 0.605688 |
get_corr_matrix(dataset = pd_loan_train[list_var_continuous],
metodo='spearman', size_figure=[10,8])
0
También hemos hecho la matriz de spearman. Ambas matrices proporcionan resultados muy parecidos, por lo que llegamos a la conclusión de que es necesario realizar una simplificación de las variables para evitar problemas en el modelado, ya que muchas de nuestras variables no nos están aportando un valor añadido y nos hace el trabajo más complejo.
corr = pd_loan_train[list_var_continuous].corr('spearman')
new_corr = corr.abs()
new_corr.loc[:,:] = np.tril(new_corr, k=-1)
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation']>0.6]
| level_0 | level_1 | correlation | |
|---|---|---|---|
| 2744 | YEARS_BUILD_MEDI | YEARS_BUILD_AVG | 0.998428 |
| 2678 | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BEGINEXPLUATATION_AVG | 0.997466 |
| 3694 | OBS_60_CNT_SOCIAL_CIRCLE | OBS_30_CNT_SOCIAL_CIRCLE | 0.997344 |
| 3074 | FLOORSMIN_MEDI | FLOORSMIN_AVG | 0.996217 |
| 3206 | LIVINGAPARTMENTS_MEDI | LIVINGAPARTMENTS_AVG | 0.996197 |
| 3140 | LANDAREA_MEDI | LANDAREA_AVG | 0.996097 |
| 2810 | COMMONAREA_MEDI | COMMONAREA_AVG | 0.995745 |
| 3272 | LIVINGAREA_MEDI | LIVINGAREA_AVG | 0.995526 |
| 2546 | APARTMENTS_MEDI | APARTMENTS_AVG | 0.995280 |
| 3008 | FLOORSMAX_MEDI | FLOORSMAX_AVG | 0.994909 |
| 2612 | BASEMENTAREA_MEDI | BASEMENTAREA_AVG | 0.994903 |
| 2942 | ENTRANCES_MEDI | ENTRANCES_AVG | 0.993446 |
| 2876 | ELEVATORS_MEDI | ELEVATORS_AVG | 0.990981 |
| 2758 | YEARS_BUILD_MEDI | YEARS_BUILD_MODE | 0.988085 |
| 1834 | YEARS_BUILD_MODE | YEARS_BUILD_AVG | 0.987839 |
| 923 | YEARS_BUILD_AVG | YEARS_BEGINEXPLUATATION_AVG | 0.987054 |
| 3022 | FLOORSMAX_MEDI | FLOORSMAX_MODE | 0.986370 |
| 2771 | YEARS_BUILD_MEDI | YEARS_BEGINEXPLUATATION_MEDI | 0.985982 |
| 3088 | FLOORSMIN_MEDI | FLOORSMIN_MODE | 0.985881 |
| 2743 | YEARS_BUILD_MEDI | YEARS_BEGINEXPLUATATION_AVG | 0.985650 |
| 2692 | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BEGINEXPLUATATION_MODE | 0.985505 |
| 1768 | YEARS_BEGINEXPLUATATION_MODE | YEARS_BEGINEXPLUATATION_AVG | 0.985173 |
| 196 | AMT_GOODS_PRICE | AMT_CREDIT | 0.984943 |
| 2679 | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_AVG | 0.984562 |
| 2164 | FLOORSMIN_MODE | FLOORSMIN_AVG | 0.982143 |
| 3338 | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAPARTMENTS_AVG | 0.981957 |
| 2098 | FLOORSMAX_MODE | FLOORSMAX_AVG | 0.981609 |
| 3404 | NONLIVINGAREA_MEDI | NONLIVINGAREA_AVG | 0.981065 |
| 1847 | YEARS_BUILD_MODE | YEARS_BEGINEXPLUATATION_MODE | 0.980198 |
| 2890 | ELEVATORS_MEDI | ELEVATORS_MODE | 0.977569 |
| 1833 | YEARS_BUILD_MODE | YEARS_BEGINEXPLUATATION_AVG | 0.975319 |
| 2693 | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MODE | 0.974900 |
| 1376 | LIVINGAPARTMENTS_AVG | APARTMENTS_AVG | 0.973931 |
| 3220 | LIVINGAPARTMENTS_MEDI | LIVINGAPARTMENTS_MODE | 0.973522 |
| 3224 | LIVINGAPARTMENTS_MEDI | APARTMENTS_MEDI | 0.971352 |
| 3154 | LANDAREA_MEDI | LANDAREA_MODE | 0.970759 |
| 3196 | LIVINGAPARTMENTS_MEDI | APARTMENTS_AVG | 0.970180 |
| 2824 | COMMONAREA_MEDI | COMMONAREA_MODE | 0.970113 |
| 2560 | APARTMENTS_MEDI | APARTMENTS_MODE | 0.970000 |
| 2296 | LIVINGAPARTMENTS_MODE | LIVINGAPARTMENTS_AVG | 0.969626 |
| 2757 | YEARS_BUILD_MEDI | YEARS_BEGINEXPLUATATION_MODE | 0.969480 |
| 1769 | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_AVG | 0.969186 |
| 3286 | LIVINGAREA_MEDI | LIVINGAREA_MODE | 0.969180 |
| 1966 | ELEVATORS_MODE | ELEVATORS_AVG | 0.968642 |
| 2556 | APARTMENTS_MEDI | LIVINGAPARTMENTS_AVG | 0.968240 |
| 2230 | LANDAREA_MODE | LANDAREA_AVG | 0.967052 |
| 1636 | APARTMENTS_MODE | APARTMENTS_AVG | 0.964885 |
| 2956 | ENTRANCES_MEDI | ENTRANCES_MODE | 0.964759 |
| 1900 | COMMONAREA_MODE | COMMONAREA_AVG | 0.964737 |
| 2362 | LIVINGAREA_MODE | LIVINGAREA_AVG | 0.964263 |
| 2626 | BASEMENTAREA_MEDI | BASEMENTAREA_MODE | 0.963546 |
| 1702 | BASEMENTAREA_MODE | BASEMENTAREA_AVG | 0.960061 |
| 2032 | ENTRANCES_MODE | ENTRANCES_AVG | 0.957524 |
| 2300 | LIVINGAPARTMENTS_MODE | APARTMENTS_MODE | 0.956511 |
| 3352 | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAPARTMENTS_MODE | 0.950325 |
| 2570 | APARTMENTS_MEDI | LIVINGAPARTMENTS_MODE | 0.947595 |
| 2286 | LIVINGAPARTMENTS_MODE | APARTMENTS_AVG | 0.944239 |
| 3418 | NONLIVINGAREA_MEDI | NONLIVINGAREA_MODE | 0.941355 |
| 3467 | TOTALAREA_MODE | LIVINGAREA_AVG | 0.939504 |
| 3495 | TOTALAREA_MODE | LIVINGAREA_MEDI | 0.935012 |
| 3210 | LIVINGAPARTMENTS_MEDI | APARTMENTS_MODE | 0.933764 |
| 2428 | NONLIVINGAPARTMENTS_MODE | NONLIVINGAPARTMENTS_AVG | 0.930926 |
| 1646 | APARTMENTS_MODE | LIVINGAPARTMENTS_AVG | 0.929945 |
| 2494 | NONLIVINGAREA_MODE | NONLIVINGAREA_AVG | 0.918045 |
| 3481 | TOTALAREA_MODE | LIVINGAREA_MODE | 0.917397 |
| 1441 | LIVINGAREA_AVG | APARTMENTS_AVG | 0.905822 |
| 3289 | LIVINGAREA_MEDI | APARTMENTS_MEDI | 0.904069 |
| 3261 | LIVINGAREA_MEDI | APARTMENTS_AVG | 0.901992 |
| 2557 | APARTMENTS_MEDI | LIVINGAREA_AVG | 0.901102 |
| 3456 | TOTALAREA_MODE | APARTMENTS_AVG | 0.899416 |
| 1451 | LIVINGAREA_AVG | LIVINGAPARTMENTS_AVG | 0.896571 |
| 3484 | TOTALAREA_MODE | APARTMENTS_MEDI | 0.894394 |
| 3299 | LIVINGAREA_MEDI | LIVINGAPARTMENTS_MEDI | 0.893339 |
| 2365 | LIVINGAREA_MODE | APARTMENTS_MODE | 0.893206 |
| 3207 | LIVINGAPARTMENTS_MEDI | LIVINGAREA_AVG | 0.892882 |
| 3271 | LIVINGAREA_MEDI | LIVINGAPARTMENTS_AVG | 0.891525 |
| 2571 | APARTMENTS_MEDI | LIVINGAREA_MODE | 0.877484 |
| 3466 | TOTALAREA_MODE | LIVINGAPARTMENTS_AVG | 0.875855 |
| 3470 | TOTALAREA_MODE | APARTMENTS_MODE | 0.875727 |
| 2375 | LIVINGAREA_MODE | LIVINGAPARTMENTS_MODE | 0.875044 |
| 3275 | LIVINGAREA_MEDI | APARTMENTS_MODE | 0.874660 |
| 2351 | LIVINGAREA_MODE | APARTMENTS_AVG | 0.873569 |
| 3494 | TOTALAREA_MODE | LIVINGAPARTMENTS_MEDI | 0.871975 |
| 1647 | APARTMENTS_MODE | LIVINGAREA_AVG | 0.870164 |
| 3285 | LIVINGAREA_MEDI | LIVINGAPARTMENTS_MODE | 0.869272 |
| 2297 | LIVINGAPARTMENTS_MODE | LIVINGAREA_AVG | 0.866421 |
| 3221 | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MODE | 0.858586 |
| 2361 | LIVINGAREA_MODE | LIVINGAPARTMENTS_AVG | 0.855480 |
| 3480 | TOTALAREA_MODE | LIVINGAPARTMENTS_MODE | 0.853377 |
| 1186 | FLOORSMAX_AVG | ELEVATORS_AVG | 0.849617 |
| 3034 | FLOORSMAX_MEDI | ELEVATORS_MEDI | 0.845770 |
| 2878 | ELEVATORS_MEDI | FLOORSMAX_AVG | 0.844668 |
| 3760 | DEF_60_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | 0.844608 |
| 3006 | FLOORSMAX_MEDI | ELEVATORS_AVG | 0.842088 |
| 2110 | FLOORSMAX_MODE | ELEVATORS_MODE | 0.833181 |
| 3020 | FLOORSMAX_MEDI | ELEVATORS_MODE | 0.830540 |
| 131 | AMT_ANNUITY | AMT_CREDIT | 0.830277 |
| 1968 | ELEVATORS_MODE | FLOORSMAX_AVG | 0.829479 |
| 197 | AMT_GOODS_PRICE | AMT_ANNUITY | 0.828031 |
| 2892 | ELEVATORS_MEDI | FLOORSMAX_MODE | 0.825961 |
| 2096 | FLOORSMAX_MODE | ELEVATORS_AVG | 0.822037 |
| 1448 | LIVINGAREA_AVG | FLOORSMAX_AVG | 0.782872 |
| 3012 | FLOORSMAX_MEDI | LIVINGAREA_AVG | 0.781536 |
| 3296 | LIVINGAREA_MEDI | FLOORSMAX_MEDI | 0.779596 |
| 3268 | LIVINGAREA_MEDI | FLOORSMAX_AVG | 0.777663 |
| 3463 | TOTALAREA_MODE | FLOORSMAX_AVG | 0.776928 |
| 2102 | FLOORSMAX_MODE | LIVINGAREA_AVG | 0.776025 |
| 3491 | TOTALAREA_MODE | FLOORSMAX_MEDI | 0.775981 |
| 3477 | TOTALAREA_MODE | FLOORSMAX_MODE | 0.774683 |
| 3282 | LIVINGAREA_MEDI | FLOORSMAX_MODE | 0.774643 |
| 2372 | LIVINGAREA_MODE | FLOORSMAX_MODE | 0.760557 |
| 1181 | FLOORSMAX_AVG | APARTMENTS_AVG | 0.756137 |
| 3001 | FLOORSMAX_MEDI | APARTMENTS_AVG | 0.754862 |
| 3029 | FLOORSMAX_MEDI | APARTMENTS_MEDI | 0.752127 |
| 2091 | FLOORSMAX_MODE | APARTMENTS_AVG | 0.750924 |
| 2553 | APARTMENTS_MEDI | FLOORSMAX_AVG | 0.750381 |
| 2567 | APARTMENTS_MEDI | FLOORSMAX_MODE | 0.748658 |
| 3026 | FLOORSMAX_MEDI | LIVINGAREA_MODE | 0.747296 |
| 2358 | LIVINGAREA_MODE | FLOORSMAX_AVG | 0.744226 |
| 2105 | FLOORSMAX_MODE | APARTMENTS_MODE | 0.734853 |
| 1446 | LIVINGAREA_AVG | ELEVATORS_AVG | 0.728249 |
| 1383 | LIVINGAPARTMENTS_AVG | FLOORSMAX_AVG | 0.725546 |
| 2882 | ELEVATORS_MEDI | LIVINGAREA_AVG | 0.723934 |
| 3011 | FLOORSMAX_MEDI | LIVINGAPARTMENTS_AVG | 0.723254 |
| 3231 | LIVINGAPARTMENTS_MEDI | FLOORSMAX_MEDI | 0.721116 |
| 3294 | LIVINGAREA_MEDI | ELEVATORS_MEDI | 0.720893 |
| 3203 | LIVINGAPARTMENTS_MEDI | FLOORSMAX_AVG | 0.720772 |
| 1442 | LIVINGAREA_AVG | BASEMENTAREA_AVG | 0.720061 |
| 3015 | FLOORSMAX_MEDI | APARTMENTS_MODE | 0.719711 |
| 3266 | LIVINGAREA_MEDI | ELEVATORS_AVG | 0.719150 |
| 2101 | FLOORSMAX_MODE | LIVINGAPARTMENTS_AVG | 0.717936 |
| 1643 | APARTMENTS_MODE | FLOORSMAX_AVG | 0.717124 |
| 3262 | LIVINGAREA_MEDI | BASEMENTAREA_AVG | 0.717041 |
| 3461 | TOTALAREA_MODE | ELEVATORS_AVG | 0.716815 |
| 3217 | LIVINGAPARTMENTS_MEDI | FLOORSMAX_MODE | 0.716248 |
| 3290 | LIVINGAREA_MEDI | BASEMENTAREA_MEDI | 0.715917 |
| 2622 | BASEMENTAREA_MEDI | LIVINGAREA_AVG | 0.714122 |
| 3489 | TOTALAREA_MODE | ELEVATORS_MEDI | 0.714046 |
| 3457 | TOTALAREA_MODE | BASEMENTAREA_AVG | 0.712283 |
| 1972 | ELEVATORS_MODE | LIVINGAREA_AVG | 0.709392 |
| 3280 | LIVINGAREA_MEDI | ELEVATORS_MODE | 0.708131 |
| 3485 | TOTALAREA_MODE | BASEMENTAREA_MEDI | 0.706017 |
| 3475 | TOTALAREA_MODE | ELEVATORS_MODE | 0.705510 |
| 2366 | LIVINGAREA_MODE | BASEMENTAREA_MODE | 0.703925 |
| 2307 | LIVINGAPARTMENTS_MODE | FLOORSMAX_MODE | 0.702851 |
| 2507 | NONLIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | 0.699992 |
| 2636 | BASEMENTAREA_MEDI | LIVINGAREA_MODE | 0.699436 |
| 791 | BASEMENTAREA_AVG | APARTMENTS_AVG | 0.699264 |
| 2352 | LIVINGAREA_MODE | BASEMENTAREA_AVG | 0.698285 |
| 2547 | APARTMENTS_MEDI | BASEMENTAREA_AVG | 0.696681 |
| 1051 | ELEVATORS_AVG | APARTMENTS_AVG | 0.696164 |
| 2639 | BASEMENTAREA_MEDI | APARTMENTS_MEDI | 0.696086 |
| 2611 | BASEMENTAREA_MEDI | APARTMENTS_AVG | 0.693750 |
| 1377 | LIVINGAPARTMENTS_AVG | BASEMENTAREA_AVG | 0.693211 |
| 3025 | FLOORSMAX_MEDI | LIVINGAPARTMENTS_MODE | 0.692947 |
| 2871 | ELEVATORS_MEDI | APARTMENTS_AVG | 0.692913 |
| 1381 | LIVINGAPARTMENTS_AVG | ELEVATORS_AVG | 0.692051 |
| 3197 | LIVINGAPARTMENTS_MEDI | BASEMENTAREA_AVG | 0.691784 |
| 2293 | LIVINGAPARTMENTS_MODE | FLOORSMAX_AVG | 0.690693 |
| 3431 | NONLIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | 0.690464 |
| 3225 | LIVINGAPARTMENTS_MEDI | BASEMENTAREA_MEDI | 0.690001 |
| 2899 | ELEVATORS_MEDI | APARTMENTS_MEDI | 0.689852 |
| 2370 | LIVINGAREA_MODE | ELEVATORS_MODE | 0.688240 |
| 2881 | ELEVATORS_MEDI | LIVINGAPARTMENTS_AVG | 0.688170 |
| 1583 | NONLIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | 0.687643 |
| 2621 | BASEMENTAREA_MEDI | LIVINGAPARTMENTS_AVG | 0.687392 |
| 2551 | APARTMENTS_MEDI | ELEVATORS_AVG | 0.687294 |
| 3229 | LIVINGAPARTMENTS_MEDI | ELEVATORS_MEDI | 0.686716 |
| 3201 | LIVINGAPARTMENTS_MEDI | ELEVATORS_AVG | 0.685277 |
| 1715 | BASEMENTAREA_MODE | APARTMENTS_MODE | 0.683802 |
| 1961 | ELEVATORS_MODE | APARTMENTS_AVG | 0.680626 |
| 3403 | NONLIVINGAREA_MEDI | NONLIVINGAPARTMENTS_AVG | 0.679343 |
| 2565 | APARTMENTS_MEDI | ELEVATORS_MODE | 0.679235 |
| 2625 | BASEMENTAREA_MEDI | APARTMENTS_MODE | 0.678549 |
| 2287 | LIVINGAPARTMENTS_MODE | BASEMENTAREA_AVG | 0.678028 |
| 2635 | BASEMENTAREA_MEDI | LIVINGAPARTMENTS_MODE | 0.677939 |
| 1637 | APARTMENTS_MODE | BASEMENTAREA_AVG | 0.677427 |
| 3276 | LIVINGAREA_MEDI | BASEMENTAREA_MODE | 0.677232 |
| 1971 | ELEVATORS_MODE | LIVINGAPARTMENTS_AVG | 0.674387 |
| 2896 | ELEVATORS_MEDI | LIVINGAREA_MODE | 0.674219 |
| 3339 | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_AVG | 0.673805 |
| 3215 | LIVINGAPARTMENTS_MEDI | ELEVATORS_MODE | 0.673757 |
| 2301 | LIVINGAPARTMENTS_MODE | BASEMENTAREA_MODE | 0.673746 |
| 3471 | TOTALAREA_MODE | BASEMENTAREA_MODE | 0.673084 |
| 1712 | BASEMENTAREA_MODE | LIVINGAREA_AVG | 0.671991 |
| 2356 | LIVINGAREA_MODE | ELEVATORS_AVG | 0.667969 |
| 1253 | FLOORSMIN_AVG | FLOORSMAX_AVG | 0.667848 |
| 3009 | FLOORSMAX_MEDI | FLOORSMIN_AVG | 0.664254 |
| 3101 | FLOORSMIN_MEDI | FLOORSMAX_MEDI | 0.663769 |
| 3073 | FLOORSMIN_MEDI | FLOORSMAX_AVG | 0.662099 |
| 1975 | ELEVATORS_MODE | APARTMENTS_MODE | 0.661488 |
| 3353 | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MODE | 0.661417 |
| 2561 | APARTMENTS_MEDI | BASEMENTAREA_MODE | 0.660128 |
| 1117 | ENTRANCES_AVG | BASEMENTAREA_AVG | 0.659801 |
| 2305 | LIVINGAPARTMENTS_MODE | ELEVATORS_MODE | 0.659662 |
| 2617 | BASEMENTAREA_MEDI | ENTRANCES_AVG | 0.659516 |
| 3417 | NONLIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MODE | 0.658267 |
| 2965 | ENTRANCES_MEDI | BASEMENTAREA_MEDI | 0.658100 |
| 1701 | BASEMENTAREA_MODE | APARTMENTS_AVG | 0.654398 |
| 2099 | FLOORSMAX_MODE | FLOORSMIN_AVG | 0.653345 |
| 3087 | FLOORSMIN_MEDI | FLOORSMAX_MODE | 0.653094 |
| 2937 | ENTRANCES_MEDI | BASEMENTAREA_AVG | 0.652333 |
| 2895 | ELEVATORS_MEDI | LIVINGAPARTMENTS_MODE | 0.649293 |
| 3211 | LIVINGAPARTMENTS_MEDI | BASEMENTAREA_MODE | 0.648081 |
| 2041 | ENTRANCES_MODE | BASEMENTAREA_MODE | 0.645885 |
| 2177 | FLOORSMIN_MODE | FLOORSMAX_MODE | 0.645556 |
| 2291 | LIVINGAPARTMENTS_MODE | ELEVATORS_AVG | 0.644567 |
| 2493 | NONLIVINGAREA_MODE | NONLIVINGAPARTMENTS_AVG | 0.644226 |
| 2885 | ELEVATORS_MEDI | APARTMENTS_MODE | 0.643921 |
| 1711 | BASEMENTAREA_MODE | LIVINGAPARTMENTS_AVG | 0.643555 |
| 2951 | ENTRANCES_MEDI | BASEMENTAREA_MODE | 0.641848 |
| 1707 | BASEMENTAREA_MODE | ENTRANCES_AVG | 0.641251 |
| 3023 | FLOORSMAX_MEDI | FLOORSMIN_MODE | 0.640916 |
| 1641 | APARTMENTS_MODE | ELEVATORS_AVG | 0.638622 |
| 2163 | FLOORSMIN_MODE | FLOORSMAX_AVG | 0.638591 |
| 2429 | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_AVG | 0.637092 |
| 3460 | TOTALAREA_MODE | COMMONAREA_AVG | 0.623052 |
| 2631 | BASEMENTAREA_MEDI | ENTRANCES_MODE | 0.621345 |
| 3488 | TOTALAREA_MODE | COMMONAREA_MEDI | 0.618942 |
| 2027 | ENTRANCES_MODE | BASEMENTAREA_AVG | 0.614406 |
| 1447 | LIVINGAREA_AVG | ENTRANCES_AVG | 0.608158 |
| 3267 | LIVINGAREA_MEDI | ENTRANCES_AVG | 0.607489 |
| 3295 | LIVINGAREA_MEDI | ENTRANCES_MEDI | 0.604612 |
| 2357 | LIVINGAREA_MODE | ENTRANCES_AVG | 0.601575 |
| 2961 | ENTRANCES_MEDI | LIVINGAREA_MODE | 0.600879 |
TRATAMIENTO DE LOS VALORES NULOS
def get_percent_null_values_target(pd_loan, list_var_continuous, target):
pd_final = pd.DataFrame()
for i in list_var_continuous:
if pd_loan[i].isnull().sum() > 0:
pd_concat_percent = pd.DataFrame(pd_loan[target][pd_loan[i].isnull()]
.value_counts(normalize=True).reset_index()).T
# Verifica cuántas columnas tiene pd_concat_percent antes de asignarlas
if pd_concat_percent.shape[1] >= 2:
pd_concat_percent.columns = [pd_concat_percent.iloc[0, 0],
pd_concat_percent.iloc[0, 1]]
elif pd_concat_percent.shape[1] == 1:
pd_concat_percent.columns = [pd_concat_percent.iloc[0, 0]]
pd_concat_percent = pd_concat_percent.drop(target, axis=0)
pd_concat_percent['variable'] = i
pd_concat_percent['sum_null_values'] = pd_loan[i].isnull().sum()
pd_concat_percent['porcentaje_sum_null_values'] = pd_loan[i].isnull().sum() / pd_loan.shape[0]
pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
if pd_final.empty:
print('No existen variables con valores nulos')
return pd_final
get_percent_null_values_target(pd_loan_train, list_var_continuous, target='TARGET')
| 0.0 | variable | sum_null_values | porcentaje_sum_null_values | 1.0 | |
|---|---|---|---|---|---|
| 0 | 1.000000 | AMT_ANNUITY | 6 | 0.000024 | NaN |
| 1 | 0.934498 | AMT_GOODS_PRICE | 229 | 0.000931 | 0.065502 |
| 2 | 0.915038 | OWN_CAR_AGE | 162531 | 0.660674 | 0.084962 |
| 3 | 1.000000 | CNT_FAM_MEMBERS | 2 | 0.000008 | NaN |
| 4 | 0.914876 | EXT_SOURCE_1 | 138763 | 0.564059 | 0.085124 |
| 5 | 0.922652 | EXT_SOURCE_2 | 543 | 0.002207 | 0.077348 |
| 6 | 0.906754 | EXT_SOURCE_3 | 48828 | 0.198481 | 0.093246 |
| 7 | 0.908370 | APARTMENTS_AVG | 124643 | 0.506662 | 0.091630 |
| 8 | 0.910746 | BASEMENTAREA_AVG | 143736 | 0.584274 | 0.089254 |
| 9 | 0.907886 | YEARS_BEGINEXPLUATATION_AVG | 119787 | 0.486923 | 0.092114 |
| 10 | 0.913042 | YEARS_BUILD_AVG | 163539 | 0.664771 | 0.086958 |
| 11 | 0.914184 | COMMONAREA_AVG | 171857 | 0.698583 | 0.085816 |
| 12 | 0.908915 | ELEVATORS_AVG | 130922 | 0.532186 | 0.091085 |
| 13 | 0.908148 | ENTRANCES_AVG | 123633 | 0.502557 | 0.091852 |
| 14 | 0.908045 | FLOORSMAX_AVG | 122179 | 0.496646 | 0.091955 |
| 15 | 0.913584 | FLOORSMIN_AVG | 166856 | 0.678254 | 0.086416 |
| 16 | 0.911689 | LANDAREA_AVG | 145825 | 0.592765 | 0.088311 |
| 17 | 0.913681 | LIVINGAPARTMENTS_AVG | 168063 | 0.683161 | 0.086319 |
| 18 | 0.908475 | LIVINGAREA_AVG | 123256 | 0.501024 | 0.091525 |
| 19 | 0.914042 | NONLIVINGAPARTMENTS_AVG | 170757 | 0.694112 | 0.085958 |
| 20 | 0.909545 | NONLIVINGAREA_AVG | 135493 | 0.550767 | 0.090455 |
| 21 | 0.908370 | APARTMENTS_MODE | 124643 | 0.506662 | 0.091630 |
| 22 | 0.910746 | BASEMENTAREA_MODE | 143736 | 0.584274 | 0.089254 |
| 23 | 0.907886 | YEARS_BEGINEXPLUATATION_MODE | 119787 | 0.486923 | 0.092114 |
| 24 | 0.913042 | YEARS_BUILD_MODE | 163539 | 0.664771 | 0.086958 |
| 25 | 0.914184 | COMMONAREA_MODE | 171857 | 0.698583 | 0.085816 |
| 26 | 0.908915 | ELEVATORS_MODE | 130922 | 0.532186 | 0.091085 |
| 27 | 0.908148 | ENTRANCES_MODE | 123633 | 0.502557 | 0.091852 |
| 28 | 0.908045 | FLOORSMAX_MODE | 122179 | 0.496646 | 0.091955 |
| 29 | 0.913584 | FLOORSMIN_MODE | 166856 | 0.678254 | 0.086416 |
| 30 | 0.911689 | LANDAREA_MODE | 145825 | 0.592765 | 0.088311 |
| 31 | 0.913681 | LIVINGAPARTMENTS_MODE | 168063 | 0.683161 | 0.086319 |
| 32 | 0.908475 | LIVINGAREA_MODE | 123256 | 0.501024 | 0.091525 |
| 33 | 0.914042 | NONLIVINGAPARTMENTS_MODE | 170757 | 0.694112 | 0.085958 |
| 34 | 0.909545 | NONLIVINGAREA_MODE | 135493 | 0.550767 | 0.090455 |
| 35 | 0.908370 | APARTMENTS_MEDI | 124643 | 0.506662 | 0.091630 |
| 36 | 0.910746 | BASEMENTAREA_MEDI | 143736 | 0.584274 | 0.089254 |
| 37 | 0.907886 | YEARS_BEGINEXPLUATATION_MEDI | 119787 | 0.486923 | 0.092114 |
| 38 | 0.913042 | YEARS_BUILD_MEDI | 163539 | 0.664771 | 0.086958 |
| 39 | 0.914184 | COMMONAREA_MEDI | 171857 | 0.698583 | 0.085816 |
| 40 | 0.908915 | ELEVATORS_MEDI | 130922 | 0.532186 | 0.091085 |
| 41 | 0.908148 | ENTRANCES_MEDI | 123633 | 0.502557 | 0.091852 |
| 42 | 0.908045 | FLOORSMAX_MEDI | 122179 | 0.496646 | 0.091955 |
| 43 | 0.913584 | FLOORSMIN_MEDI | 166856 | 0.678254 | 0.086416 |
| 44 | 0.911689 | LANDAREA_MEDI | 145825 | 0.592765 | 0.088311 |
| 45 | 0.913681 | LIVINGAPARTMENTS_MEDI | 168063 | 0.683161 | 0.086319 |
| 46 | 0.908475 | LIVINGAREA_MEDI | 123256 | 0.501024 | 0.091525 |
| 47 | 0.914042 | NONLIVINGAPARTMENTS_MEDI | 170757 | 0.694112 | 0.085958 |
| 48 | 0.909545 | NONLIVINGAREA_MEDI | 135493 | 0.550767 | 0.090455 |
| 49 | 0.907624 | TOTALAREA_MODE | 118526 | 0.481797 | 0.092376 |
| 50 | 0.963592 | OBS_30_CNT_SOCIAL_CIRCLE | 824 | 0.003349 | 0.036408 |
| 51 | 0.963592 | DEF_30_CNT_SOCIAL_CIRCLE | 824 | 0.003349 | 0.036408 |
| 52 | 0.963592 | OBS_60_CNT_SOCIAL_CIRCLE | 824 | 0.003349 | 0.036408 |
| 53 | 0.963592 | DEF_60_CNT_SOCIAL_CIRCLE | 824 | 0.003349 | 0.036408 |
| 54 | 0.896084 | AMT_REQ_CREDIT_BUREAU_HOUR | 33248 | 0.135150 | 0.103916 |
| 55 | 0.896084 | AMT_REQ_CREDIT_BUREAU_DAY | 33248 | 0.135150 | 0.103916 |
| 56 | 0.896084 | AMT_REQ_CREDIT_BUREAU_WEEK | 33248 | 0.135150 | 0.103916 |
| 57 | 0.896084 | AMT_REQ_CREDIT_BUREAU_MON | 33248 | 0.135150 | 0.103916 |
| 58 | 0.896084 | AMT_REQ_CREDIT_BUREAU_QRT | 33248 | 0.135150 | 0.103916 |
| 59 | 0.896084 | AMT_REQ_CREDIT_BUREAU_YEAR | 33248 | 0.135150 | 0.103916 |
Hemos decidido que nuestro umbral de valores nulos se situará en el 50%, es decir, sólo usaremos aquellas variables que cuenten con menos de un 50% de valores nulos. Haciendo este filtrado, creamos un nuevo df llamado pd_loan_train_filtered.
threshold = 0.5
list_vars_not_null = list(pd_null_columnas[pd_null_columnas['porcentaje_columnas'] < threshold].index)
pd_loan_train_filtered = pd_loan_train.loc[:, list_vars_not_null]
print(pd_loan_train_filtered)
FLOORSMAX_MEDI FLOORSMAX_AVG FLOORSMAX_MODE \
SK_ID_CURR
429844 NaN NaN NaN
259562 NaN NaN NaN
188975 NaN NaN NaN
254199 0.7083 0.6917 0.6250
332686 0.3333 0.3333 0.3333
... ... ... ...
191213 0.3333 0.3333 0.3333
383329 NaN NaN NaN
177037 NaN NaN NaN
358151 0.4583 0.4583 0.4583
277125 0.3333 0.3333 0.3333
YEARS_BEGINEXPLUATATION_MEDI YEARS_BEGINEXPLUATATION_MODE \
SK_ID_CURR
429844 NaN NaN
259562 NaN NaN
188975 NaN NaN
254199 0.9990 0.9985
332686 0.9861 0.9861
... ... ...
191213 0.9891 0.9891
383329 NaN NaN
177037 NaN NaN
358151 0.9896 0.9896
277125 0.9861 0.9861
YEARS_BEGINEXPLUATATION_AVG TOTALAREA_MODE EMERGENCYSTATE_MODE \
SK_ID_CURR
429844 NaN NaN NaN
259562 NaN NaN NaN
188975 NaN NaN NaN
254199 0.9985 0.1849 No
332686 0.9861 0.2577 No
... ... ... ...
191213 0.9891 0.0988 No
383329 NaN NaN NaN
177037 NaN NaN NaN
358151 0.9896 0.0880 No
277125 0.9861 0.1811 No
OCCUPATION_TYPE EXT_SOURCE_3 AMT_REQ_CREDIT_BUREAU_MON \
SK_ID_CURR
429844 Laborers 0.665855 0.0
259562 Sales staff 0.808394 0.0
188975 NaN NaN 0.0
254199 High skill tech staff 0.519097 1.0
332686 Core staff 0.537070 1.0
... ... ... ...
191213 Laborers 0.837747 0.0
383329 Managers 0.358951 1.0
177037 NaN 0.843544 0.0
358151 Cooking staff 0.626304 0.0
277125 Medicine staff 0.360613 0.0
AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_YEAR \
SK_ID_CURR
429844 0.0 1.0
259562 0.0 1.0
188975 0.0 1.0
254199 0.0 0.0
332686 0.0 0.0
... ... ...
191213 0.0 3.0
383329 0.0 3.0
177037 0.0 2.0
358151 0.0 1.0
277125 0.0 5.0
AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_DAY \
SK_ID_CURR
429844 1.0 0.0
259562 0.0 0.0
188975 0.0 0.0
254199 0.0 0.0
332686 0.0 0.0
... ... ...
191213 0.0 0.0
383329 1.0 0.0
177037 0.0 0.0
358151 0.0 0.0
277125 0.0 0.0
AMT_REQ_CREDIT_BUREAU_WEEK NAME_TYPE_SUITE \
SK_ID_CURR
429844 0.0 Unaccompanied
259562 0.0 Unaccompanied
188975 0.0 Spouse, partner
254199 0.0 Unaccompanied
332686 0.0 Unaccompanied
... ... ...
191213 0.0 Unaccompanied
383329 0.0 Unaccompanied
177037 0.0 Unaccompanied
358151 0.0 Unaccompanied
277125 0.0 Unaccompanied
OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE \
SK_ID_CURR
429844 10.0 0.0
259562 1.0 0.0
188975 4.0 0.0
254199 2.0 0.0
332686 0.0 0.0
... ... ...
191213 0.0 0.0
383329 0.0 0.0
177037 3.0 1.0
358151 1.0 0.0
277125 0.0 0.0
OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE EXT_SOURCE_2 \
SK_ID_CURR
429844 10.0 0.0 0.671561
259562 1.0 0.0 0.684803
188975 4.0 0.0 0.163187
254199 2.0 0.0 0.630240
332686 0.0 0.0 0.565010
... ... ... ...
191213 0.0 0.0 0.685879
383329 0.0 0.0 0.527424
177037 3.0 0.0 0.638523
358151 1.0 0.0 0.726744
277125 0.0 0.0 0.638629
AMT_GOODS_PRICE AMT_ANNUITY CNT_FAM_MEMBERS FLAG_DOCUMENT_4 \
SK_ID_CURR
429844 450000.0 22500.0 2.0 0
259562 450000.0 26613.0 1.0 0
188975 454500.0 21672.0 3.0 0
254199 283500.0 12042.0 2.0 0
332686 679500.0 23931.0 4.0 0
... ... ... ... ...
191213 585000.0 29250.0 3.0 0
383329 900000.0 94509.0 3.0 0
177037 225000.0 26226.0 2.0 0
358151 225000.0 16852.5 2.0 0
277125 225000.0 14242.5 1.0 0
FLAG_DOCUMENT_7 FLAG_DOCUMENT_6 FLAG_DOCUMENT_5 \
SK_ID_CURR
429844 0 0 0
259562 0 0 0
188975 0 0 0
254199 0 0 0
332686 0 0 0
... ... ... ...
191213 0 0 0
383329 0 0 0
177037 0 0 0
358151 0 0 0
277125 0 0 0
FLAG_DOCUMENT_8 FLAG_DOCUMENT_12 DAYS_LAST_PHONE_CHANGE \
SK_ID_CURR
429844 0 0 -1623.0
259562 0 0 -2948.0
188975 0 0 -2064.0
254199 0 0 -1.0
332686 0 0 -602.0
... ... ... ...
191213 0 0 -2145.0
383329 0 0 -2879.0
177037 0 0 -686.0
358151 0 0 -1658.0
277125 0 0 -1807.0
FLAG_DOCUMENT_3 FLAG_DOCUMENT_2 FLAG_DOCUMENT_11 \
SK_ID_CURR
429844 0 0 0
259562 1 0 0
188975 1 0 0
254199 1 0 0
332686 1 0 0
... ... ... ...
191213 0 0 0
383329 1 0 0
177037 1 0 0
358151 1 0 0
277125 1 0 0
FLAG_DOCUMENT_21 FLAG_DOCUMENT_20 FLAG_DOCUMENT_19 \
SK_ID_CURR
429844 0 0 0
259562 0 0 0
188975 0 0 0
254199 0 0 0
332686 0 0 0
... ... ... ...
191213 0 0 0
383329 0 0 0
177037 0 0 0
358151 0 0 0
277125 0 0 0
FLAG_DOCUMENT_18 FLAG_DOCUMENT_17 FLAG_DOCUMENT_9 \
SK_ID_CURR
429844 0 0 0
259562 0 0 0
188975 0 0 0
254199 0 0 0
332686 0 0 0
... ... ... ...
191213 0 0 0
383329 0 0 0
177037 0 0 0
358151 0 0 0
277125 0 0 0
FLAG_DOCUMENT_16 FLAG_DOCUMENT_15 FLAG_DOCUMENT_14 \
SK_ID_CURR
429844 0 0 0
259562 0 0 0
188975 0 0 0
254199 1 0 0
332686 0 0 0
... ... ... ...
191213 0 0 0
383329 0 0 0
177037 0 0 0
358151 0 0 0
277125 0 0 0
FLAG_DOCUMENT_13 FLAG_DOCUMENT_10 NAME_CONTRACT_TYPE CODE_GENDER \
SK_ID_CURR
429844 0 0 Revolving loans M
259562 0 0 Cash loans F
188975 0 0 Cash loans M
254199 0 0 Cash loans F
332686 0 0 Cash loans F
... ... ... ... ...
191213 0 0 Revolving loans F
383329 0 0 Cash loans M
177037 0 0 Cash loans F
358151 0 0 Cash loans F
277125 0 0 Cash loans F
FLAG_MOBIL FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN \
SK_ID_CURR
429844 1 N Y 0
259562 1 Y N 0
188975 1 N Y 1
254199 1 N N 0
332686 1 N N 2
... ... ... ... ...
191213 1 Y Y 1
383329 1 Y Y 1
177037 1 N N 0
358151 1 N Y 0
277125 1 N Y 0
AMT_INCOME_TOTAL AMT_CREDIT NAME_INCOME_TYPE \
SK_ID_CURR
429844 247500.0 450000.0 Commercial associate
259562 90000.0 521280.0 Working
188975 81000.0 508495.5 Pensioner
254199 144000.0 433188.0 Commercial associate
332686 225000.0 814041.0 Working
... ... ... ...
191213 202500.0 585000.0 Commercial associate
383329 225000.0 900000.0 Working
177037 135000.0 254700.0 Pensioner
358151 157500.0 343800.0 Working
277125 76500.0 269550.0 Working
NAME_EDUCATION_TYPE NAME_FAMILY_STATUS \
SK_ID_CURR
429844 Secondary / secondary special Married
259562 Secondary / secondary special Single / not married
188975 Lower secondary Married
254199 Secondary / secondary special Married
332686 Higher education Married
... ... ...
191213 Secondary / secondary special Civil marriage
383329 Secondary / secondary special Married
177037 Higher education Married
358151 Secondary / secondary special Civil marriage
277125 Secondary / secondary special Widow
NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH \
SK_ID_CURR
429844 House / apartment 0.046220 -19807
259562 House / apartment 0.018634 -16262
188975 House / apartment 0.031329 -11388
254199 House / apartment 0.035792 -19922
332686 House / apartment 0.025164 -14371
... ... ... ...
191213 House / apartment 0.028663 -20227
383329 House / apartment 0.025164 -17586
177037 House / apartment 0.030755 -22541
358151 House / apartment 0.018850 -9328
277125 House / apartment 0.018634 -14036
DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH FLAG_EMP_PHONE \
SK_ID_CURR
429844 -111 -2222.0 -3368 1
259562 -4284 -5854.0 -4476 1
188975 365243 -4881.0 -3940 0
254199 -9018 -293.0 -3123 1
332686 -785 -1581.0 -3679 1
... ... ... ... ...
191213 -1973 -4085.0 -2791 1
383329 -2545 -3463.0 -1147 1
177037 365243 -7885.0 -3800 0
358151 -959 -7701.0 -2022 1
277125 -5056 -529.0 -4577 1
ORGANIZATION_TYPE FLAG_WORK_PHONE FLAG_CONT_MOBILE \
SK_ID_CURR
429844 Business Entity Type 2 0 1
259562 Self-employed 1 1
188975 XNA 0 1
254199 Business Entity Type 2 0 1
332686 Realtor 1 1
... ... ... ...
191213 Business Entity Type 3 0 1
383329 Self-employed 0 1
177037 XNA 0 1
358151 Business Entity Type 3 0 1
277125 Government 1 1
FLAG_PHONE FLAG_EMAIL REGION_RATING_CLIENT \
SK_ID_CURR
429844 0 0 1
259562 0 0 2
188975 0 0 2
254199 1 0 2
332686 0 0 2
... ... ... ...
191213 0 0 2
383329 0 0 2
177037 0 0 2
358151 0 0 2
277125 1 0 2
REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START \
SK_ID_CURR
429844 1 SUNDAY
259562 2 TUESDAY
188975 2 THURSDAY
254199 2 TUESDAY
332686 2 TUESDAY
... ... ...
191213 2 TUESDAY
383329 2 MONDAY
177037 2 THURSDAY
358151 2 WEDNESDAY
277125 2 TUESDAY
HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION \
SK_ID_CURR
429844 16 0
259562 16 0
188975 15 0
254199 19 0
332686 10 0
... ... ...
191213 16 0
383329 12 0
177037 11 0
358151 13 0
277125 12 0
REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION \
SK_ID_CURR
429844 0 0
259562 0 0
188975 0 0
254199 0 0
332686 0 0
... ... ...
191213 0 0
383329 0 0
177037 0 0
358151 0 0
277125 0 0
REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY \
SK_ID_CURR
429844 0 1
259562 0 0
188975 0 0
254199 0 0
332686 0 0
... ... ...
191213 0 0
383329 0 0
177037 0 0
358151 0 0
277125 0 0
LIVE_CITY_NOT_WORK_CITY TARGET
SK_ID_CURR
429844 1 0
259562 0 0
188975 0 0
254199 0 0
332686 0 0
... ... ...
191213 0 0
383329 0 0
177037 0 0
358151 0 0
277125 0 0
[246008 rows x 80 columns]
pd_loan_train['AMT_GOODS_PRICE'].fillna(pd_loan_train['AMT_GOODS_PRICE'].median(), inplace=True)
pd_loan_train['AMT_ANNUITY'].fillna(pd_loan_train['AMT_ANNUITY'].median(), inplace=True)
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\3300765175.py:1: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\3300765175.py:2: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
TRATAMIENTO VARIABLES CATEGORICAS
list_var_cat
['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
OPCIÓN 2
pd_loan[list_var_continuous] = pd_loan[list_var_continuous].apply(lambda x: x.fillna(x.median()))
confusion_matrix = pd.crosstab(pd_loan_train["TARGET"], pd_loan_train["TARGET"])
print(confusion_matrix)
cramers_v(confusion_matrix.values)
TARGET 0 1 TARGET 0 226148 0 1 0 19860
0.9999726127135284
confusion_matrix = pd.crosstab(pd_loan_train["TARGET"], pd_loan_train["TARGET"])
cramers_v(confusion_matrix.values)
0.9999726127135284
confusion_matrix = pd.crosstab(pd_loan_train["TARGET"], pd_loan_train["OWN_CAR_AGE"])
cramers_v(confusion_matrix.values)
0.06001203138415476
pd_loan_train[list_var_cat] = pd_loan_train[list_var_cat].astype("object").fillna("SIN VALOR").astype("category")
pd_loan_test[list_var_cat] = pd_loan_test[list_var_cat].astype("object").fillna("SIN VALOR").astype("category")
print(pd_loan_test)
NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY \
SK_ID_CURR
112958 Cash loans F N N
238225 Cash loans F N Y
269335 Cash loans F N Y
450836 Cash loans M Y Y
340566 Revolving loans F Y Y
... ... ... ... ...
235730 Cash loans F N N
367911 Cash loans M Y N
302585 Cash loans M Y N
102961 Cash loans F N Y
125815 Cash loans M N Y
CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY \
SK_ID_CURR
112958 0 92592.0 521280.0 19984.5
238225 1 81000.0 238500.0 13819.5
269335 0 81000.0 276277.5 13419.0
450836 0 112500.0 1058197.5 38137.5
340566 0 90000.0 495000.0 24750.0
... ... ... ... ...
235730 1 270000.0 1711368.0 70758.0
367911 0 112500.0 571446.0 16506.0
302585 0 171000.0 706410.0 67203.0
102961 0 72000.0 1129500.0 31059.0
125815 0 121500.0 630000.0 32296.5
AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE \
SK_ID_CURR
112958 450000.0 Unaccompanied Commercial associate
238225 238500.0 Unaccompanied Commercial associate
269335 238500.0 Unaccompanied Pensioner
450836 913500.0 Family Pensioner
340566 495000.0 Unaccompanied Working
... ... ... ...
235730 1597500.0 Unaccompanied Commercial associate
367911 477000.0 Unaccompanied Working
302585 679500.0 Unaccompanied Commercial associate
102961 1129500.0 Unaccompanied Pensioner
125815 630000.0 Unaccompanied Working
NAME_EDUCATION_TYPE NAME_FAMILY_STATUS \
SK_ID_CURR
112958 Higher education Married
238225 Secondary / secondary special Married
269335 Secondary / secondary special Single / not married
450836 Higher education Married
340566 Secondary / secondary special Married
... ... ...
235730 Secondary / secondary special Married
367911 Secondary / secondary special Married
302585 Higher education Separated
102961 Higher education Married
125815 Incomplete higher Single / not married
NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH \
SK_ID_CURR
112958 House / apartment 0.035792 -19647
238225 House / apartment 0.018209 -14789
269335 House / apartment 0.046220 -21913
450836 House / apartment 0.035792 -15868
340566 House / apartment 0.018850 -13855
... ... ... ...
235730 House / apartment 0.072508 -10263
367911 House / apartment 0.022625 -19451
302585 Office apartment 0.019101 -16577
102961 House / apartment 0.026392 -20665
125815 House / apartment 0.018029 -8156
DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE \
SK_ID_CURR
112958 -1240 -7902.0 -3208 NaN
238225 -7774 -1392.0 -135 NaN
269335 365243 -8461.0 -4290 NaN
450836 365243 -9291.0 -4629 19.0
340566 -2816 -6911.0 -4996 6.0
... ... ... ... ...
235730 -2904 -5059.0 -1356 NaN
367911 -296 -5586.0 -2992 1.0
302585 -2067 -70.0 -76 1.0
102961 365243 -4180.0 -4224 NaN
125815 -1494 -3034.0 -844 NaN
FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE \
SK_ID_CURR
112958 1 1 1 1
238225 1 1 0 1
269335 1 0 0 1
450836 1 0 0 1
340566 1 1 1 1
... ... ... ... ...
235730 1 1 0 1
367911 1 1 0 1
302585 1 1 0 1
102961 1 0 0 1
125815 1 1 0 1
FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS \
SK_ID_CURR
112958 0 0 SIN VALOR 2.0
238225 1 0 Core staff 3.0
269335 0 0 SIN VALOR 1.0
450836 0 0 SIN VALOR 2.0
340566 1 1 Core staff 2.0
... ... ... ... ...
235730 0 1 High skill tech staff 3.0
367911 1 0 Laborers 2.0
302585 0 0 SIN VALOR 1.0
102961 0 0 SIN VALOR 2.0
125815 0 0 Managers 1.0
REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY \
SK_ID_CURR
112958 2 2
238225 3 3
269335 1 1
450836 2 2
340566 2 2
... ... ...
235730 1 1
367911 2 2
302585 2 2
102961 2 2
125815 3 3
WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START \
SK_ID_CURR
112958 FRIDAY 12
238225 THURSDAY 12
269335 TUESDAY 10
450836 THURSDAY 11
340566 WEDNESDAY 12
... ... ...
235730 MONDAY 12
367911 THURSDAY 10
302585 FRIDAY 9
102961 THURSDAY 11
125815 FRIDAY 5
REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION \
SK_ID_CURR
112958 0 0
238225 0 0
269335 0 0
450836 0 0
340566 0 0
... ... ...
235730 0 0
367911 0 0
302585 0 0
102961 0 0
125815 0 0
LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY \
SK_ID_CURR
112958 0 0
238225 0 0
269335 0 0
450836 0 0
340566 0 0
... ... ...
235730 0 0
367911 0 0
302585 0 0
102961 0 0
125815 0 0
REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY \
SK_ID_CURR
112958 0 0
238225 1 1
269335 0 0
450836 0 0
340566 1 1
... ... ...
235730 0 0
367911 0 0
302585 0 0
102961 0 0
125815 1 1
ORGANIZATION_TYPE EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 \
SK_ID_CURR
112958 School NaN 0.673949 0.694093
238225 Transport: type 2 NaN 0.626277 NaN
269335 XNA NaN 0.733055 0.511892
450836 XNA 0.338143 0.566085 0.349055
340566 Government 0.573164 0.200526 0.144648
... ... ... ... ...
235730 Business Entity Type 3 0.648827 0.742543 NaN
367911 Business Entity Type 3 NaN 0.573798 0.597192
302585 Other 0.446338 0.702760 0.710674
102961 XNA NaN 0.685977 0.723837
125815 Other 0.163182 0.408982 0.474051
APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG \
SK_ID_CURR
112958 0.0629 0.0520 0.9796
238225 NaN NaN NaN
269335 NaN NaN NaN
450836 NaN NaN NaN
340566 NaN NaN NaN
... ... ... ...
235730 0.1017 0.0924 0.9776
367911 0.1031 NaN 0.9806
302585 NaN NaN NaN
102961 0.1639 NaN 0.9940
125815 0.0825 NaN 0.9811
YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG \
SK_ID_CURR
112958 0.7212 0.008 0.0000 0.1379
238225 NaN NaN NaN NaN
269335 NaN NaN NaN NaN
450836 NaN NaN NaN NaN
340566 NaN NaN NaN NaN
... ... ... ... ...
235730 0.6940 0.000 0.0132 0.1434
367911 NaN NaN NaN 0.2069
302585 NaN NaN NaN NaN
102961 NaN NaN 0.0800 0.0345
125815 0.7416 NaN 0.0000 0.1379
FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG \
SK_ID_CURR
112958 0.1667 0.2083 0.0438 0.0513
238225 NaN NaN NaN NaN
269335 NaN NaN NaN NaN
450836 NaN NaN NaN NaN
340566 NaN NaN NaN NaN
... ... ... ... ...
235730 0.2429 0.0417 0.0000 0.0828
367911 0.1667 NaN NaN NaN
302585 NaN NaN NaN NaN
102961 0.6250 NaN NaN NaN
125815 0.1667 0.1667 0.0110 NaN
LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG \
SK_ID_CURR
112958 0.0550 0.0000 0.0000
238225 NaN NaN NaN
269335 NaN NaN NaN
450836 NaN NaN NaN
340566 NaN NaN NaN
... ... ... ...
235730 0.0906 0.0006 0.0001
367911 0.0914 NaN NaN
302585 NaN NaN NaN
102961 0.1354 NaN 0.0163
125815 0.0518 NaN 0.0922
APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE \
SK_ID_CURR
112958 0.0641 0.054 0.9796
238225 NaN NaN NaN
269335 NaN NaN NaN
450836 NaN NaN NaN
340566 NaN NaN NaN
... ... ... ...
235730 0.1050 0.108 0.9762
367911 0.1050 NaN 0.9806
302585 NaN NaN NaN
102961 0.1670 NaN 0.9940
125815 0.0840 NaN 0.9811
YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE \
SK_ID_CURR
112958 0.7321 0.0081 0.0000 0.1379
238225 NaN NaN NaN NaN
269335 NaN NaN NaN NaN
450836 NaN NaN NaN NaN
340566 NaN NaN NaN NaN
... ... ... ... ...
235730 0.6864 0.0000 0.0000 0.1724
367911 NaN NaN NaN 0.2069
302585 NaN NaN NaN NaN
102961 NaN NaN 0.0806 0.0345
125815 0.7517 NaN 0.0000 0.1379
FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE \
SK_ID_CURR
112958 0.1667 0.2083 0.0448
238225 NaN NaN NaN
269335 NaN NaN NaN
450836 NaN NaN NaN
340566 NaN NaN NaN
... ... ... ...
235730 0.1667 0.0417 0.0000
367911 0.1667 NaN NaN
302585 NaN NaN NaN
102961 0.6250 NaN NaN
125815 0.1667 0.1667 0.0113
LIVINGAPARTMENTS_MODE LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE \
SK_ID_CURR
112958 0.0560 0.0573 0.0
238225 NaN NaN NaN
269335 NaN NaN NaN
450836 NaN NaN NaN
340566 NaN NaN NaN
... ... ... ...
235730 0.0918 0.0748 0.0
367911 NaN 0.0952 NaN
302585 NaN NaN NaN
102961 NaN 0.1410 NaN
125815 NaN 0.0540 NaN
NONLIVINGAREA_MODE APARTMENTS_MEDI BASEMENTAREA_MEDI \
SK_ID_CURR
112958 0.0000 0.0635 0.052
238225 NaN NaN NaN
269335 NaN NaN NaN
450836 NaN NaN NaN
340566 NaN NaN NaN
... ... ... ...
235730 0.0000 0.1041 0.104
367911 NaN 0.1041 NaN
302585 NaN NaN NaN
102961 0.0172 0.1655 NaN
125815 0.0976 0.0833 NaN
YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI COMMONAREA_MEDI \
SK_ID_CURR
112958 0.9796 0.7249 0.008
238225 NaN NaN NaN
269335 NaN NaN NaN
450836 NaN NaN NaN
340566 NaN NaN NaN
... ... ... ...
235730 0.9762 0.6780 0.000
367911 0.9806 NaN NaN
302585 NaN NaN NaN
102961 0.9940 NaN NaN
125815 0.9811 0.7451 NaN
ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI \
SK_ID_CURR
112958 0.00 0.1379 0.1667 0.2083
238225 NaN NaN NaN NaN
269335 NaN NaN NaN NaN
450836 NaN NaN NaN NaN
340566 NaN NaN NaN NaN
... ... ... ... ...
235730 0.00 0.1724 0.1667 0.0417
367911 NaN 0.2069 0.1667 NaN
302585 NaN NaN NaN NaN
102961 0.08 0.0345 0.6250 NaN
125815 0.00 0.1379 0.1667 0.1667
LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI \
SK_ID_CURR
112958 0.0446 0.0522 0.0560
238225 NaN NaN NaN
269335 NaN NaN NaN
450836 NaN NaN NaN
340566 NaN NaN NaN
... ... ... ...
235730 0.0000 0.0855 0.0918
367911 NaN NaN 0.0930
302585 NaN NaN NaN
102961 NaN NaN 0.1378
125815 0.0112 NaN 0.0528
NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI \
SK_ID_CURR
112958 0.0 0.0000
238225 NaN NaN
269335 NaN NaN
450836 NaN NaN
340566 NaN NaN
... ... ...
235730 0.0 0.0000
367911 NaN NaN
302585 NaN NaN
102961 NaN 0.0166
125815 NaN 0.0941
FONDKAPREMONT_MODE HOUSETYPE_MODE TOTALAREA_MODE \
SK_ID_CURR
112958 reg oper account block of flats 0.0433
238225 SIN VALOR SIN VALOR NaN
269335 SIN VALOR SIN VALOR NaN
450836 SIN VALOR SIN VALOR NaN
340566 SIN VALOR SIN VALOR NaN
... ... ... ...
235730 reg oper account block of flats 0.0565
367911 SIN VALOR block of flats 0.0797
302585 SIN VALOR SIN VALOR NaN
102961 SIN VALOR block of flats 0.1100
125815 reg oper spec account block of flats 0.0608
WALLSMATERIAL_MODE EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE \
SK_ID_CURR
112958 Panel No 3.0
238225 SIN VALOR SIN VALOR 1.0
269335 SIN VALOR SIN VALOR 1.0
450836 SIN VALOR SIN VALOR 5.0
340566 SIN VALOR SIN VALOR 0.0
... ... ... ...
235730 Block No 0.0
367911 SIN VALOR No 0.0
302585 SIN VALOR SIN VALOR 0.0
102961 Monolithic No 7.0
125815 Panel No 4.0
DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE \
SK_ID_CURR
112958 0.0 3.0
238225 0.0 1.0
269335 0.0 1.0
450836 0.0 5.0
340566 0.0 0.0
... ... ...
235730 0.0 0.0
367911 0.0 0.0
302585 0.0 0.0
102961 0.0 7.0
125815 1.0 4.0
DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 \
SK_ID_CURR
112958 0.0 -1103.0 0
238225 0.0 -3107.0 0
269335 0.0 -894.0 0
450836 0.0 -849.0 0
340566 0.0 -1563.0 0
... ... ... ...
235730 0.0 -1902.0 0
367911 0.0 -2635.0 0
302585 0.0 -517.0 0
102961 0.0 0.0 0
125815 1.0 -1197.0 0
FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 \
SK_ID_CURR
112958 0 0 0
238225 1 0 0
269335 0 0 0
450836 1 0 0
340566 0 0 0
... ... ... ...
235730 0 0 0
367911 1 0 0
302585 1 0 0
102961 0 0 0
125815 1 0 0
FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 \
SK_ID_CURR
112958 0 0 0
238225 0 0 0
269335 1 0 0
450836 0 0 0
340566 0 0 0
... ... ... ...
235730 0 0 1
367911 0 0 0
302585 0 0 0
102961 1 0 0
125815 0 0 0
FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 \
SK_ID_CURR
112958 0 0 0
238225 0 0 0
269335 0 0 0
450836 0 0 0
340566 0 0 0
... ... ... ...
235730 0 0 0
367911 0 0 0
302585 0 0 0
102961 0 0 0
125815 0 0 0
FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 \
SK_ID_CURR
112958 0 0 0
238225 0 0 0
269335 0 0 0
450836 0 0 0
340566 0 0 0
... ... ... ...
235730 0 0 0
367911 0 1 0
302585 0 0 0
102961 0 0 0
125815 0 0 0
FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 \
SK_ID_CURR
112958 0 0 0
238225 0 0 0
269335 0 0 0
450836 0 0 0
340566 0 0 0
... ... ... ...
235730 0 0 0
367911 0 0 0
302585 0 0 0
102961 0 0 0
125815 0 0 0
FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 \
SK_ID_CURR
112958 1 0 0
238225 0 0 0
269335 0 0 0
450836 0 0 0
340566 0 0 0
... ... ... ...
235730 0 0 0
367911 0 0 0
302585 0 0 0
102961 0 0 0
125815 0 0 0
FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR \
SK_ID_CURR
112958 0 0.0
238225 0 0.0
269335 0 0.0
450836 0 0.0
340566 0 0.0
... ... ...
235730 0 0.0
367911 0 0.0
302585 0 0.0
102961 0 0.0
125815 0 0.0
AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK \
SK_ID_CURR
112958 0.0 0.0
238225 0.0 0.0
269335 0.0 0.0
450836 0.0 0.0
340566 0.0 0.0
... ... ...
235730 0.0 0.0
367911 0.0 0.0
302585 0.0 0.0
102961 0.0 0.0
125815 0.0 0.0
AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT \
SK_ID_CURR
112958 0.0 0.0
238225 0.0 1.0
269335 0.0 0.0
450836 0.0 0.0
340566 1.0 0.0
... ... ...
235730 1.0 0.0
367911 0.0 0.0
302585 0.0 0.0
102961 2.0 1.0
125815 0.0 0.0
AMT_REQ_CREDIT_BUREAU_YEAR TARGET
SK_ID_CURR
112958 1.0 0
238225 0.0 0
269335 2.0 0
450836 2.0 0
340566 5.0 0
... ... ...
235730 2.0 0
367911 0.0 0
302585 1.0 0
102961 3.0 0
125815 1.0 0
[61503 rows x 121 columns]
pd_loan_train.to_csv("../data/train_pd_data_preprocessing_missing_outlier.csv")
pd_loan_test.to_csv("../data/test_pd_data_preprocessing_missing_outlier.csv")